diff --git "a/experiment_config.json" "b/experiment_config.json" new file mode 100644--- /dev/null +++ "b/experiment_config.json" @@ -0,0 +1,231153 @@ +{ + "training_args": { + "output_dir": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters/qa_quoref_answer_generation_lora_v2", + "overwrite_output_dir": false, + "do_train": false, + "do_eval": true, + "do_predict": false, + "eval_strategy": "steps", + "prediction_loss_only": false, + "per_device_train_batch_size": 4, + "per_device_eval_batch_size": 8, + "per_gpu_train_batch_size": null, + "per_gpu_eval_batch_size": null, + "gradient_accumulation_steps": 4, + "eval_accumulation_steps": null, + "eval_delay": 0, + "torch_empty_cache_steps": null, + "learning_rate": 2e-05, + "weight_decay": 0.0, + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-08, + "max_grad_norm": 1.0, + "num_train_epochs": 3, + "max_steps": -1, + "lr_scheduler_type": "linear", + "lr_scheduler_kwargs": {}, + "warmup_ratio": 0.0, + "warmup_steps": 0, + "log_level": "passive", + "log_level_replica": "warning", + "log_on_each_node": true, + "logging_dir": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters/qa_quoref_answer_generation_lora_v2/runs/Sep04_03-44-32_gx12", + "logging_strategy": "steps", + "logging_first_step": false, + "logging_steps": 20, + "logging_nan_inf_filter": true, + "save_strategy": "epoch", + "save_steps": 40, + "save_total_limit": null, + "save_safetensors": true, + "save_on_each_node": false, + "save_only_model": false, + "restore_callback_states_from_checkpoint": false, + "no_cuda": false, + "use_cpu": false, + "use_mps_device": false, + "seed": 42, + "data_seed": null, + "jit_mode_eval": false, + "use_ipex": false, + "bf16": false, + "fp16": false, + "fp16_opt_level": "O1", + "half_precision_backend": "auto", + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "local_rank": 0, + "ddp_backend": null, + "tpu_num_cores": null, + "tpu_metrics_debug": false, + "debug": [], + "dataloader_drop_last": false, + "eval_steps": 20, + "dataloader_num_workers": 0, + "dataloader_prefetch_factor": null, + "past_index": -1, + "run_name": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters/qa_quoref_answer_generation_lora_v2", + "disable_tqdm": false, + "remove_unused_columns": true, + "label_names": null, + "load_best_model_at_end": false, + "metric_for_best_model": null, + "greater_is_better": null, + "ignore_data_skip": false, + "fsdp": [], + "fsdp_min_num_params": 0, + "fsdp_config": { + "min_num_params": 0, + "xla": false, + "xla_fsdp_v2": false, + "xla_fsdp_grad_ckpt": false + }, + "fsdp_transformer_layer_cls_to_wrap": null, + "accelerator_config": { + "split_batches": false, + "dispatch_batches": null, + "even_batches": true, + "use_seedable_sampler": true, + "non_blocking": false, + "gradient_accumulation_kwargs": null + }, + "deepspeed": null, + "label_smoothing_factor": 0.0, + "optim": "adamw_torch", + "optim_args": null, + "adafactor": false, + "group_by_length": false, + "length_column_name": "length", + "report_to": [], + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "skip_memory_metrics": true, + "use_legacy_prediction_loop": false, + "push_to_hub": false, + "resume_from_checkpoint": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_token": "", + "hub_private_repo": null, + "hub_always_push": false, + "gradient_checkpointing": false, + "gradient_checkpointing_kwargs": null, + "include_inputs_for_metrics": false, + "include_for_metrics": [], + "eval_do_concat_batches": true, + "fp16_backend": "auto", + "push_to_hub_model_id": null, + "push_to_hub_organization": null, + "push_to_hub_token": "", + "mp_parameters": "", + "auto_find_batch_size": false, + "full_determinism": false, + "torchdynamo": null, + "ray_scope": "last", + "ddp_timeout": 1800, + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "include_tokens_per_second": false, + "include_num_input_tokens_seen": false, + "neftune_noise_alpha": null, + "optim_target_modules": null, + "batch_eval_metrics": false, + "eval_on_start": false, + "use_liger_kernel": false, + "eval_use_gather_object": false, + "average_tokens_across_devices": false + }, + "lora_config": { + "task_type": "CAUSAL_LM", + "peft_type": "LORA", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "revision": null, + "inference_mode": false, + "r": 16, + "target_modules": [ + "o_proj", + "down_proj", + "v_proj", + "up_proj", + "q_proj", + "gate_proj", + "k_proj" + ], + "exclude_modules": null, + "lora_alpha": 16, + "lora_dropout": 0.1, + "fan_in_fan_out": false, + "bias": "none", + "use_rslora": true, + "modules_to_save": null, + "init_lora_weights": true, + "layers_to_transform": null, + "layers_pattern": null, + "rank_pattern": {}, + "alpha_pattern": {}, + "megatron_config": null, + "megatron_core": "megatron.core", + "trainable_token_indices": null, + "loftq_config": {}, + "eva_config": null, + "corda_config": null, + "use_dora": false, + "layer_replication": null, + "runtime_config": { + "ephemeral_gpu_offload": false + }, + "lora_bias": false + }, + "flops": { + "eval": 224996302651284480, + "train": 5.201887604744794e+16, + "total": 2.770151786987324e+17 + }, + "total_energy": 80.49119, + "logs": [ + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:44:54.030340", + "step": 0, + "epoch": 0 + }, + { + "type": "pplx", + "content": 513.7015351300394, + "timestamp": "2025-09-04 03:44:54.032582", + "step": 0, + "epoch": 0 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:44:54.153228", + "step": 0, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.22628124058246613, + "timestamp": "2025-09-04 03:44:54.166141", + "step": 1, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:44:54.277663", + "step": 1, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.18759363889694214, + "timestamp": "2025-09-04 03:44:54.296326", + "step": 2, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:44:54.403899", + "step": 2, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.21276673674583435, + "timestamp": "2025-09-04 03:44:54.422364", + "step": 3, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:44:54.529920", + "step": 3, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.20896945893764496, + "timestamp": "2025-09-04 03:44:54.584227", + "step": 4, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:44:54.682359", + "step": 4, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.15049788355827332, + "timestamp": "2025-09-04 03:44:54.701147", + "step": 5, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:44:54.795660", + "step": 5, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.221171572804451, + "timestamp": "2025-09-04 03:44:54.812689", + "step": 6, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:44:54.925572", + "step": 6, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.13799867033958435, + "timestamp": "2025-09-04 03:44:54.945627", + "step": 7, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:44:55.048805", + "step": 7, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.254111111164093, + "timestamp": "2025-09-04 03:44:55.068403", + "step": 8, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:44:55.176185", + "step": 8, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2516261041164398, + "timestamp": "2025-09-04 03:44:55.197753", + "step": 9, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:44:55.292267", + "step": 9, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.11826080828905106, + "timestamp": "2025-09-04 03:44:55.309480", + "step": 10, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 03:44:55.392164", + "step": 10, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.3149326741695404, + "timestamp": "2025-09-04 03:44:55.404837", + "step": 11, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:44:55.501241", + "step": 11, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.1865367889404297, + "timestamp": "2025-09-04 03:44:55.519423", + "step": 12, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:44:55.618360", + "step": 12, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.22903558611869812, + "timestamp": "2025-09-04 03:44:55.638547", + "step": 13, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:44:55.727492", + "step": 13, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.1957457810640335, + "timestamp": "2025-09-04 03:44:55.743023", + "step": 14, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:44:55.850708", + "step": 14, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.26458948850631714, + "timestamp": "2025-09-04 03:44:55.870711", + "step": 15, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:44:55.956573", + "step": 15, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.19251839816570282, + "timestamp": "2025-09-04 03:44:55.972768", + "step": 16, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:44:56.065326", + "step": 16, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.169407457113266, + "timestamp": "2025-09-04 03:44:56.084337", + "step": 17, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:44:56.164607", + "step": 17, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.1738697588443756, + "timestamp": "2025-09-04 03:44:56.178386", + "step": 18, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 03:44:56.250786", + "step": 18, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.12069570273160934, + "timestamp": "2025-09-04 03:44:56.263511", + "step": 19, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:44:56.374549", + "step": 19, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.28613629937171936, + "timestamp": "2025-09-04 03:44:56.395709", + "step": 20, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:45:04.776963", + "step": 20, + "epoch": 1 + }, + { + "type": "pplx", + "content": 459.69697261034077, + "timestamp": "2025-09-04 03:45:04.779321", + "step": 20, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:45:04.883567", + "step": 20, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0930766835808754, + "timestamp": "2025-09-04 03:45:04.905592", + "step": 21, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1392 + ], + "flops": 27840169073088.0 + }, + "timestamp": "2025-09-04 03:45:05.111656", + "step": 21, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.16422075033187866, + "timestamp": "2025-09-04 03:45:05.150856", + "step": 22, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:45:05.262285", + "step": 22, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.15494759380817413, + "timestamp": "2025-09-04 03:45:05.282894", + "step": 23, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1472 + ], + "flops": 29440178786048.0 + }, + "timestamp": "2025-09-04 03:45:05.496961", + "step": 23, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.1941477507352829, + "timestamp": "2025-09-04 03:45:05.538559", + "step": 24, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:45:05.640198", + "step": 24, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.046016380190849304, + "timestamp": "2025-09-04 03:45:05.661299", + "step": 25, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:45:05.770082", + "step": 25, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.26754552125930786, + "timestamp": "2025-09-04 03:45:05.789791", + "step": 26, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:45:05.887422", + "step": 26, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2157849818468094, + "timestamp": "2025-09-04 03:45:05.903902", + "step": 27, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:45:06.003259", + "step": 27, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.09500601142644882, + "timestamp": "2025-09-04 03:45:06.022367", + "step": 28, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:45:06.127436", + "step": 28, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2806503176689148, + "timestamp": "2025-09-04 03:45:06.149677", + "step": 29, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:45:06.261111", + "step": 29, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.147545725107193, + "timestamp": "2025-09-04 03:45:06.281302", + "step": 30, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:45:06.383742", + "step": 30, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0946376770734787, + "timestamp": "2025-09-04 03:45:06.402770", + "step": 31, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:45:06.498965", + "step": 31, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07105350494384766, + "timestamp": "2025-09-04 03:45:06.516556", + "step": 32, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 03:45:06.595117", + "step": 32, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05865024775266647, + "timestamp": "2025-09-04 03:45:06.609671", + "step": 33, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:45:06.700887", + "step": 33, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04186881706118584, + "timestamp": "2025-09-04 03:45:06.717379", + "step": 34, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:45:06.820494", + "step": 34, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.1130460873246193, + "timestamp": "2025-09-04 03:45:06.839402", + "step": 35, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:45:06.935561", + "step": 35, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08032827824354172, + "timestamp": "2025-09-04 03:45:06.953667", + "step": 36, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:45:07.060582", + "step": 36, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.10274317115545273, + "timestamp": "2025-09-04 03:45:07.082672", + "step": 37, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:45:07.192812", + "step": 37, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05926857143640518, + "timestamp": "2025-09-04 03:45:07.212899", + "step": 38, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:45:07.301385", + "step": 38, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.09572148323059082, + "timestamp": "2025-09-04 03:45:07.316491", + "step": 39, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:45:07.394036", + "step": 39, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06614067405462265, + "timestamp": "2025-09-04 03:45:07.408562", + "step": 40, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:45:15.977735", + "step": 40, + "epoch": 1 + }, + { + "type": "pplx", + "content": 389.98537419869405, + "timestamp": "2025-09-04 03:45:15.980248", + "step": 40, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 40", + "timestamp": "2025-09-04 03:45:16.522945", + "step": 40, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:45:16.607576", + "step": 40, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0893501564860344, + "timestamp": "2025-09-04 03:45:16.624424", + "step": 41, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:45:16.725320", + "step": 41, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.10664358735084534, + "timestamp": "2025-09-04 03:45:16.743948", + "step": 42, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:45:16.850040", + "step": 42, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.11798786371946335, + "timestamp": "2025-09-04 03:45:16.869875", + "step": 43, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:45:16.959614", + "step": 43, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.09886734187602997, + "timestamp": "2025-09-04 03:45:16.975202", + "step": 44, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:45:17.049375", + "step": 44, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.040978167206048965, + "timestamp": "2025-09-04 03:45:17.064184", + "step": 45, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:45:17.173784", + "step": 45, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06903672963380814, + "timestamp": "2025-09-04 03:45:17.193812", + "step": 46, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:45:17.296985", + "step": 46, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02300487831234932, + "timestamp": "2025-09-04 03:45:17.316264", + "step": 47, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:45:17.408720", + "step": 47, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0908779725432396, + "timestamp": "2025-09-04 03:45:17.426033", + "step": 48, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:45:17.523711", + "step": 48, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08729575574398041, + "timestamp": "2025-09-04 03:45:17.543919", + "step": 49, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:45:17.637324", + "step": 49, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0870480164885521, + "timestamp": "2025-09-04 03:45:17.654175", + "step": 50, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:45:17.756185", + "step": 50, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.050713177770376205, + "timestamp": "2025-09-04 03:45:17.775418", + "step": 51, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:45:17.857774", + "step": 51, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04926443472504616, + "timestamp": "2025-09-04 03:45:17.873580", + "step": 52, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:45:17.976932", + "step": 52, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014587458223104477, + "timestamp": "2025-09-04 03:45:17.998752", + "step": 53, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:45:18.096464", + "step": 53, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0563230998814106, + "timestamp": "2025-09-04 03:45:18.113944", + "step": 54, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:45:18.204053", + "step": 54, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.052430395036935806, + "timestamp": "2025-09-04 03:45:18.220905", + "step": 55, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:45:18.299562", + "step": 55, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08462968468666077, + "timestamp": "2025-09-04 03:45:18.314267", + "step": 56, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:45:18.410745", + "step": 56, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06088142469525337, + "timestamp": "2025-09-04 03:45:18.431360", + "step": 57, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:45:18.531885", + "step": 57, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.10583172738552094, + "timestamp": "2025-09-04 03:45:18.550360", + "step": 58, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:45:18.645711", + "step": 58, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05197317525744438, + "timestamp": "2025-09-04 03:45:18.663022", + "step": 59, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:45:18.767990", + "step": 59, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0938020795583725, + "timestamp": "2025-09-04 03:45:18.788566", + "step": 60, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:45:27.161836", + "step": 60, + "epoch": 1 + }, + { + "type": "pplx", + "content": 344.9719250300418, + "timestamp": "2025-09-04 03:45:27.164144", + "step": 60, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:45:27.256830", + "step": 60, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.047754935920238495, + "timestamp": "2025-09-04 03:45:27.275797", + "step": 61, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:45:27.384552", + "step": 61, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08348787575960159, + "timestamp": "2025-09-04 03:45:27.404855", + "step": 62, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:45:27.496674", + "step": 62, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.048718009144067764, + "timestamp": "2025-09-04 03:45:27.513341", + "step": 63, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1488 + ], + "flops": 29760180728640.0 + }, + "timestamp": "2025-09-04 03:45:27.732444", + "step": 63, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06368561834096909, + "timestamp": "2025-09-04 03:45:27.775457", + "step": 64, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:45:27.866602", + "step": 64, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.047636643052101135, + "timestamp": "2025-09-04 03:45:27.885576", + "step": 65, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:45:27.994542", + "step": 65, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.09618725627660751, + "timestamp": "2025-09-04 03:45:28.014601", + "step": 66, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 880 + ], + "flops": 17600106910144.0 + }, + "timestamp": "2025-09-04 03:45:28.146160", + "step": 66, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.09339220821857452, + "timestamp": "2025-09-04 03:45:28.169472", + "step": 67, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:45:28.276347", + "step": 67, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05055764317512512, + "timestamp": "2025-09-04 03:45:28.296999", + "step": 68, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:45:28.379869", + "step": 68, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06613060086965561, + "timestamp": "2025-09-04 03:45:28.396302", + "step": 69, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:45:28.498772", + "step": 69, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08002032339572906, + "timestamp": "2025-09-04 03:45:28.517928", + "step": 70, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:45:28.595425", + "step": 70, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.09714193642139435, + "timestamp": "2025-09-04 03:45:28.609410", + "step": 71, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:45:28.716648", + "step": 71, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.048158854246139526, + "timestamp": "2025-09-04 03:45:28.737622", + "step": 72, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:45:28.833604", + "step": 72, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.17001864314079285, + "timestamp": "2025-09-04 03:45:28.854016", + "step": 73, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:45:28.963395", + "step": 73, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025268767029047012, + "timestamp": "2025-09-04 03:45:28.983767", + "step": 74, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:45:29.089863", + "step": 74, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025647073984146118, + "timestamp": "2025-09-04 03:45:29.109647", + "step": 75, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:45:29.193050", + "step": 75, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06310443580150604, + "timestamp": "2025-09-04 03:45:29.208797", + "step": 76, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:45:29.289171", + "step": 76, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006233109161257744, + "timestamp": "2025-09-04 03:45:29.305528", + "step": 77, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:45:29.403738", + "step": 77, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03743808716535568, + "timestamp": "2025-09-04 03:45:29.422112", + "step": 78, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:45:29.532220", + "step": 78, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06183940917253494, + "timestamp": "2025-09-04 03:45:29.552634", + "step": 79, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:45:29.658591", + "step": 79, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03081091120839119, + "timestamp": "2025-09-04 03:45:29.679007", + "step": 80, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:45:38.054098", + "step": 80, + "epoch": 1 + }, + { + "type": "pplx", + "content": 318.24303809489555, + "timestamp": "2025-09-04 03:45:38.056443", + "step": 80, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 80", + "timestamp": "2025-09-04 03:45:38.556351", + "step": 80, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:45:38.660607", + "step": 80, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0675966814160347, + "timestamp": "2025-09-04 03:45:38.682606", + "step": 81, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:45:38.778116", + "step": 81, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07781155407428741, + "timestamp": "2025-09-04 03:45:38.795394", + "step": 82, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:45:38.899758", + "step": 82, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06105361878871918, + "timestamp": "2025-09-04 03:45:38.918706", + "step": 83, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:45:39.026173", + "step": 83, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.049994468688964844, + "timestamp": "2025-09-04 03:45:39.046706", + "step": 84, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:45:39.138861", + "step": 84, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.1184161901473999, + "timestamp": "2025-09-04 03:45:39.157849", + "step": 85, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:45:39.243936", + "step": 85, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07634230703115463, + "timestamp": "2025-09-04 03:45:39.259348", + "step": 86, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:45:39.361762", + "step": 86, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03923124074935913, + "timestamp": "2025-09-04 03:45:39.380754", + "step": 87, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:45:39.458872", + "step": 87, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0754866749048233, + "timestamp": "2025-09-04 03:45:39.473629", + "step": 88, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 03:45:39.542590", + "step": 88, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.049534834921360016, + "timestamp": "2025-09-04 03:45:39.556461", + "step": 89, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:45:39.634576", + "step": 89, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.09451133012771606, + "timestamp": "2025-09-04 03:45:39.648463", + "step": 90, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:45:39.750807", + "step": 90, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.1034846305847168, + "timestamp": "2025-09-04 03:45:39.770033", + "step": 91, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:45:39.855459", + "step": 91, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.060452286154031754, + "timestamp": "2025-09-04 03:45:39.871436", + "step": 92, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:45:39.961786", + "step": 92, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05689983442425728, + "timestamp": "2025-09-04 03:45:39.980357", + "step": 93, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 03:45:40.109208", + "step": 93, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009844101965427399, + "timestamp": "2025-09-04 03:45:40.132173", + "step": 94, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:45:40.214872", + "step": 94, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08469025045633316, + "timestamp": "2025-09-04 03:45:40.229769", + "step": 95, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:45:40.328739", + "step": 95, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.10152354091405869, + "timestamp": "2025-09-04 03:45:40.347831", + "step": 96, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:45:40.445260", + "step": 96, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0582207627594471, + "timestamp": "2025-09-04 03:45:40.465784", + "step": 97, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:45:40.580598", + "step": 97, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05192510411143303, + "timestamp": "2025-09-04 03:45:40.600367", + "step": 98, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:45:40.700989", + "step": 98, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.09743145853281021, + "timestamp": "2025-09-04 03:45:40.719330", + "step": 99, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:45:40.815781", + "step": 99, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07235053181648254, + "timestamp": "2025-09-04 03:45:40.833849", + "step": 100, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:45:49.188649", + "step": 100, + "epoch": 1 + }, + { + "type": "pplx", + "content": 307.3854100186044, + "timestamp": "2025-09-04 03:45:49.190883", + "step": 100, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 960 + ], + "flops": 19200116623104.0 + }, + "timestamp": "2025-09-04 03:45:49.324428", + "step": 100, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04918394982814789, + "timestamp": "2025-09-04 03:45:49.353243", + "step": 101, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:45:49.444040", + "step": 101, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.12214594334363937, + "timestamp": "2025-09-04 03:45:49.460765", + "step": 102, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:45:49.561386", + "step": 102, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.044912759214639664, + "timestamp": "2025-09-04 03:45:49.579949", + "step": 103, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:45:49.681534", + "step": 103, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.14099526405334473, + "timestamp": "2025-09-04 03:45:49.701325", + "step": 104, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:45:49.783498", + "step": 104, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0638093501329422, + "timestamp": "2025-09-04 03:45:49.800247", + "step": 105, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:45:49.904552", + "step": 105, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013638557866215706, + "timestamp": "2025-09-04 03:45:49.923592", + "step": 106, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:45:50.006927", + "step": 106, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07801222056150436, + "timestamp": "2025-09-04 03:45:50.022042", + "step": 107, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 03:45:50.144157", + "step": 107, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06590745598077774, + "timestamp": "2025-09-04 03:45:50.166685", + "step": 108, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:45:50.262360", + "step": 108, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.11835378408432007, + "timestamp": "2025-09-04 03:45:50.282746", + "step": 109, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:45:50.376627", + "step": 109, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.12223128229379654, + "timestamp": "2025-09-04 03:45:50.393565", + "step": 110, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:45:50.504278", + "step": 110, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.1280670166015625, + "timestamp": "2025-09-04 03:45:50.524729", + "step": 111, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:45:50.623814", + "step": 111, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06618773937225342, + "timestamp": "2025-09-04 03:45:50.642958", + "step": 112, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:45:50.734068", + "step": 112, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04645576328039169, + "timestamp": "2025-09-04 03:45:50.752647", + "step": 113, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:45:50.829181", + "step": 113, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.13508065044879913, + "timestamp": "2025-09-04 03:45:50.842752", + "step": 114, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:45:50.934855", + "step": 114, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0594618059694767, + "timestamp": "2025-09-04 03:45:50.951746", + "step": 115, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:45:51.053664", + "step": 115, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.11379887908697128, + "timestamp": "2025-09-04 03:45:51.073378", + "step": 116, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:45:51.164518", + "step": 116, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.039490919560194016, + "timestamp": "2025-09-04 03:45:51.183413", + "step": 117, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:45:51.291693", + "step": 117, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04585837572813034, + "timestamp": "2025-09-04 03:45:51.311758", + "step": 118, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:45:51.407070", + "step": 118, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07603704929351807, + "timestamp": "2025-09-04 03:45:51.424283", + "step": 119, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 03:45:51.540887", + "step": 119, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016811968758702278, + "timestamp": "2025-09-04 03:45:51.563695", + "step": 120, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:45:59.943776", + "step": 120, + "epoch": 1 + }, + { + "type": "pplx", + "content": 299.89610834083794, + "timestamp": "2025-09-04 03:45:59.945489", + "step": 120, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 120", + "timestamp": "2025-09-04 03:46:00.290401", + "step": 120, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1248 + ], + "flops": 24960151589760.0 + }, + "timestamp": "2025-09-04 03:46:00.476561", + "step": 120, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08362013101577759, + "timestamp": "2025-09-04 03:46:00.514696", + "step": 121, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:46:00.596482", + "step": 121, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.059237729758024216, + "timestamp": "2025-09-04 03:46:00.611683", + "step": 122, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 03:46:00.727922", + "step": 122, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.039004936814308167, + "timestamp": "2025-09-04 03:46:00.749930", + "step": 123, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 03:46:00.824487", + "step": 123, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.11705737560987473, + "timestamp": "2025-09-04 03:46:00.838610", + "step": 124, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:46:00.914119", + "step": 124, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06630095839500427, + "timestamp": "2025-09-04 03:46:00.929469", + "step": 125, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:46:01.019324", + "step": 125, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04516802728176117, + "timestamp": "2025-09-04 03:46:01.035861", + "step": 126, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:46:01.144582", + "step": 126, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04330586642026901, + "timestamp": "2025-09-04 03:46:01.165051", + "step": 127, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:46:01.259475", + "step": 127, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.061153244227170944, + "timestamp": "2025-09-04 03:46:01.277402", + "step": 128, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:46:01.374266", + "step": 128, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.11837725341320038, + "timestamp": "2025-09-04 03:46:01.394579", + "step": 129, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:46:01.476506", + "step": 129, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02547391690313816, + "timestamp": "2025-09-04 03:46:01.491380", + "step": 130, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:46:01.598388", + "step": 130, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03671078011393547, + "timestamp": "2025-09-04 03:46:01.618545", + "step": 131, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:46:01.711084", + "step": 131, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.15452733635902405, + "timestamp": "2025-09-04 03:46:01.728810", + "step": 132, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:46:01.804783", + "step": 132, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07617174834012985, + "timestamp": "2025-09-04 03:46:01.820045", + "step": 133, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:46:01.920338", + "step": 133, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.062410350888967514, + "timestamp": "2025-09-04 03:46:01.938924", + "step": 134, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 03:46:02.054545", + "step": 134, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0330992229282856, + "timestamp": "2025-09-04 03:46:02.076694", + "step": 135, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:46:02.186634", + "step": 135, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029934488236904144, + "timestamp": "2025-09-04 03:46:02.207952", + "step": 136, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:46:02.306911", + "step": 136, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.048758380115032196, + "timestamp": "2025-09-04 03:46:02.327997", + "step": 137, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:46:02.424443", + "step": 137, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.11269187927246094, + "timestamp": "2025-09-04 03:46:02.441749", + "step": 138, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:46:02.543693", + "step": 138, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0775558203458786, + "timestamp": "2025-09-04 03:46:02.562600", + "step": 139, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:46:02.658416", + "step": 139, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05858410522341728, + "timestamp": "2025-09-04 03:46:02.676400", + "step": 140, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:46:11.045850", + "step": 140, + "epoch": 1 + }, + { + "type": "pplx", + "content": 295.5688961078636, + "timestamp": "2025-09-04 03:46:11.048049", + "step": 140, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:46:11.145509", + "step": 140, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.034822121262550354, + "timestamp": "2025-09-04 03:46:11.166258", + "step": 141, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:46:11.275463", + "step": 141, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025513645261526108, + "timestamp": "2025-09-04 03:46:11.295707", + "step": 142, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:46:11.405349", + "step": 142, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022908439859747887, + "timestamp": "2025-09-04 03:46:11.425606", + "step": 143, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:46:11.531596", + "step": 143, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05994102731347084, + "timestamp": "2025-09-04 03:46:11.552142", + "step": 144, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 896 + ], + "flops": 17920108852736.0 + }, + "timestamp": "2025-09-04 03:46:11.680734", + "step": 144, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06833264976739883, + "timestamp": "2025-09-04 03:46:11.707754", + "step": 145, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:46:11.809626", + "step": 145, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07911453396081924, + "timestamp": "2025-09-04 03:46:11.828239", + "step": 146, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:46:11.934765", + "step": 146, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08064809441566467, + "timestamp": "2025-09-04 03:46:11.954473", + "step": 147, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:46:12.062843", + "step": 147, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03310004621744156, + "timestamp": "2025-09-04 03:46:12.083696", + "step": 148, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:46:12.166474", + "step": 148, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03188398852944374, + "timestamp": "2025-09-04 03:46:12.183268", + "step": 149, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:46:12.293783", + "step": 149, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03804198279976845, + "timestamp": "2025-09-04 03:46:12.314141", + "step": 150, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:46:12.418609", + "step": 150, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03391795977950096, + "timestamp": "2025-09-04 03:46:12.437588", + "step": 151, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:46:12.515641", + "step": 151, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024908579885959625, + "timestamp": "2025-09-04 03:46:12.530339", + "step": 152, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:46:12.621956", + "step": 152, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0160362645983696, + "timestamp": "2025-09-04 03:46:12.640839", + "step": 153, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:46:12.741410", + "step": 153, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030184214934706688, + "timestamp": "2025-09-04 03:46:12.760113", + "step": 154, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:46:12.839419", + "step": 154, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03809965401887894, + "timestamp": "2025-09-04 03:46:12.853334", + "step": 155, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:46:12.953947", + "step": 155, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03310762345790863, + "timestamp": "2025-09-04 03:46:12.973351", + "step": 156, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:46:13.079957", + "step": 156, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05042213574051857, + "timestamp": "2025-09-04 03:46:13.102326", + "step": 157, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 03:46:13.223808", + "step": 157, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03846399486064911, + "timestamp": "2025-09-04 03:46:13.245799", + "step": 158, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:46:13.354628", + "step": 158, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030467255041003227, + "timestamp": "2025-09-04 03:46:13.374922", + "step": 159, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:46:13.482931", + "step": 159, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07905561476945877, + "timestamp": "2025-09-04 03:46:13.503834", + "step": 160, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:46:21.876993", + "step": 160, + "epoch": 1 + }, + { + "type": "pplx", + "content": 299.0262854255139, + "timestamp": "2025-09-04 03:46:21.879135", + "step": 160, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 160", + "timestamp": "2025-09-04 03:46:22.373385", + "step": 160, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:46:22.469238", + "step": 160, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.10933341830968857, + "timestamp": "2025-09-04 03:46:22.489584", + "step": 161, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:46:22.572640", + "step": 161, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029027054086327553, + "timestamp": "2025-09-04 03:46:22.587595", + "step": 162, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:46:22.688492", + "step": 162, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012869374826550484, + "timestamp": "2025-09-04 03:46:22.707182", + "step": 163, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:46:22.789587", + "step": 163, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03408244624733925, + "timestamp": "2025-09-04 03:46:22.805117", + "step": 164, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:46:22.897367", + "step": 164, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024805987253785133, + "timestamp": "2025-09-04 03:46:22.916271", + "step": 165, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:46:23.006515", + "step": 165, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02760004997253418, + "timestamp": "2025-09-04 03:46:23.023015", + "step": 166, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:46:23.115490", + "step": 166, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06649627536535263, + "timestamp": "2025-09-04 03:46:23.132373", + "step": 167, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:46:23.236020", + "step": 167, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08248773962259293, + "timestamp": "2025-09-04 03:46:23.255843", + "step": 168, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:46:23.364354", + "step": 168, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030205007642507553, + "timestamp": "2025-09-04 03:46:23.386069", + "step": 169, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:46:23.480780", + "step": 169, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07514587044715881, + "timestamp": "2025-09-04 03:46:23.498073", + "step": 170, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 03:46:23.576363", + "step": 170, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07391712814569473, + "timestamp": "2025-09-04 03:46:23.589072", + "step": 171, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:46:23.689317", + "step": 171, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04588964581489563, + "timestamp": "2025-09-04 03:46:23.708742", + "step": 172, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:46:23.787355", + "step": 172, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01995799131691456, + "timestamp": "2025-09-04 03:46:23.802655", + "step": 173, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:46:23.914759", + "step": 173, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04095141217112541, + "timestamp": "2025-09-04 03:46:23.934809", + "step": 174, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:46:24.040047", + "step": 174, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07867230474948883, + "timestamp": "2025-09-04 03:46:24.060041", + "step": 175, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:46:24.144862", + "step": 175, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08477169275283813, + "timestamp": "2025-09-04 03:46:24.161034", + "step": 176, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:46:24.258703", + "step": 176, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.048271287232637405, + "timestamp": "2025-09-04 03:46:24.279194", + "step": 177, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:46:24.380449", + "step": 177, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.056753601878881454, + "timestamp": "2025-09-04 03:46:24.399137", + "step": 178, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:46:24.481124", + "step": 178, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0256480872631073, + "timestamp": "2025-09-04 03:46:24.496079", + "step": 179, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:46:24.605160", + "step": 179, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00964992307126522, + "timestamp": "2025-09-04 03:46:24.626113", + "step": 180, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:46:32.989309", + "step": 180, + "epoch": 1 + }, + { + "type": "pplx", + "content": 304.3604161862801, + "timestamp": "2025-09-04 03:46:32.990886", + "step": 180, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:46:33.063611", + "step": 180, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08969972282648087, + "timestamp": "2025-09-04 03:46:33.078807", + "step": 181, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:46:33.186793", + "step": 181, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029566926881670952, + "timestamp": "2025-09-04 03:46:33.207034", + "step": 182, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:46:33.309505", + "step": 182, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016738848760724068, + "timestamp": "2025-09-04 03:46:33.328509", + "step": 183, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:46:33.404480", + "step": 183, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030050721019506454, + "timestamp": "2025-09-04 03:46:33.419146", + "step": 184, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 800 + ], + "flops": 16000097197184.0 + }, + "timestamp": "2025-09-04 03:46:33.538599", + "step": 184, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.032734520733356476, + "timestamp": "2025-09-04 03:46:33.562256", + "step": 185, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:46:33.660578", + "step": 185, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03777821734547615, + "timestamp": "2025-09-04 03:46:33.678971", + "step": 186, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:46:33.785505", + "step": 186, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07701154798269272, + "timestamp": "2025-09-04 03:46:33.805660", + "step": 187, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:46:33.911868", + "step": 187, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.040181923657655716, + "timestamp": "2025-09-04 03:46:33.932359", + "step": 188, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:46:34.013114", + "step": 188, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.10895628482103348, + "timestamp": "2025-09-04 03:46:34.028150", + "step": 189, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:46:34.111556", + "step": 189, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03597872704267502, + "timestamp": "2025-09-04 03:46:34.126652", + "step": 190, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:46:34.223903", + "step": 190, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020103169605135918, + "timestamp": "2025-09-04 03:46:34.241188", + "step": 191, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:46:34.350326", + "step": 191, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04040881618857384, + "timestamp": "2025-09-04 03:46:34.371538", + "step": 192, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:46:34.478562", + "step": 192, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05105682834982872, + "timestamp": "2025-09-04 03:46:34.501219", + "step": 193, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:46:34.606829", + "step": 193, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013259065337479115, + "timestamp": "2025-09-04 03:46:34.626713", + "step": 194, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:46:34.721624", + "step": 194, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05866575241088867, + "timestamp": "2025-09-04 03:46:34.738863", + "step": 195, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:46:34.815582", + "step": 195, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0314386822283268, + "timestamp": "2025-09-04 03:46:34.830274", + "step": 196, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:46:34.928107", + "step": 196, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08559767156839371, + "timestamp": "2025-09-04 03:46:34.948472", + "step": 197, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:46:35.041821", + "step": 197, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.051514316350221634, + "timestamp": "2025-09-04 03:46:35.059202", + "step": 198, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:46:35.153126", + "step": 198, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021353455260396004, + "timestamp": "2025-09-04 03:46:35.170412", + "step": 199, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:46:35.280731", + "step": 199, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006292775738984346, + "timestamp": "2025-09-04 03:46:35.301953", + "step": 200, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:46:43.677848", + "step": 200, + "epoch": 1 + }, + { + "type": "pplx", + "content": 308.7893902698641, + "timestamp": "2025-09-04 03:46:43.680096", + "step": 200, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 200", + "timestamp": "2025-09-04 03:46:44.033775", + "step": 200, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:46:44.114791", + "step": 200, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.13709776103496552, + "timestamp": "2025-09-04 03:46:44.131611", + "step": 201, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:46:44.234364", + "step": 201, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06784407794475555, + "timestamp": "2025-09-04 03:46:44.253436", + "step": 202, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:46:44.346048", + "step": 202, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023510560393333435, + "timestamp": "2025-09-04 03:46:44.363111", + "step": 203, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:46:44.440942", + "step": 203, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.1330084502696991, + "timestamp": "2025-09-04 03:46:44.455646", + "step": 204, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:46:44.555190", + "step": 204, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02427174523472786, + "timestamp": "2025-09-04 03:46:44.576158", + "step": 205, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:46:44.654034", + "step": 205, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08518130332231522, + "timestamp": "2025-09-04 03:46:44.667993", + "step": 206, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:46:44.745924", + "step": 206, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.028402844443917274, + "timestamp": "2025-09-04 03:46:44.759722", + "step": 207, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:46:44.862696", + "step": 207, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022264117375016212, + "timestamp": "2025-09-04 03:46:44.882252", + "step": 208, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:46:44.989666", + "step": 208, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014760280027985573, + "timestamp": "2025-09-04 03:46:45.011616", + "step": 209, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:46:45.120759", + "step": 209, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02477847971022129, + "timestamp": "2025-09-04 03:46:45.140323", + "step": 210, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:46:45.245385", + "step": 210, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.09371557086706161, + "timestamp": "2025-09-04 03:46:45.264286", + "step": 211, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:46:45.368282", + "step": 211, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.040149930864572525, + "timestamp": "2025-09-04 03:46:45.387896", + "step": 212, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:46:45.487708", + "step": 212, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05073068290948868, + "timestamp": "2025-09-04 03:46:45.508705", + "step": 213, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1408 + ], + "flops": 28160171015680.0 + }, + "timestamp": "2025-09-04 03:46:45.724051", + "step": 213, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03666497766971588, + "timestamp": "2025-09-04 03:46:45.762808", + "step": 214, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:46:45.841828", + "step": 214, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025966059416532516, + "timestamp": "2025-09-04 03:46:45.855469", + "step": 215, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:46:45.957760", + "step": 215, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04828720539808273, + "timestamp": "2025-09-04 03:46:45.976941", + "step": 216, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:46:46.074755", + "step": 216, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008038034662604332, + "timestamp": "2025-09-04 03:46:46.094767", + "step": 217, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:46:46.198305", + "step": 217, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02476082369685173, + "timestamp": "2025-09-04 03:46:46.217094", + "step": 218, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:46:46.317114", + "step": 218, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03774585947394371, + "timestamp": "2025-09-04 03:46:46.335244", + "step": 219, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:46:46.435893", + "step": 219, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04572395235300064, + "timestamp": "2025-09-04 03:46:46.454869", + "step": 220, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:46:54.891309", + "step": 220, + "epoch": 1 + }, + { + "type": "pplx", + "content": 311.0440274571144, + "timestamp": "2025-09-04 03:46:54.893262", + "step": 220, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:46:54.966356", + "step": 220, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020546726882457733, + "timestamp": "2025-09-04 03:46:54.981421", + "step": 221, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:46:55.059442", + "step": 221, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.047001611441373825, + "timestamp": "2025-09-04 03:46:55.073274", + "step": 222, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:46:55.182713", + "step": 222, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04990854486823082, + "timestamp": "2025-09-04 03:46:55.203243", + "step": 223, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:46:55.297066", + "step": 223, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03975335881114006, + "timestamp": "2025-09-04 03:46:55.314614", + "step": 224, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:46:55.412711", + "step": 224, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.037995487451553345, + "timestamp": "2025-09-04 03:46:55.433055", + "step": 225, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:46:55.535515", + "step": 225, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06138302758336067, + "timestamp": "2025-09-04 03:46:55.554130", + "step": 226, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1488 + ], + "flops": 29760180728640.0 + }, + "timestamp": "2025-09-04 03:46:55.774170", + "step": 226, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04723235219717026, + "timestamp": "2025-09-04 03:46:55.816481", + "step": 227, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:46:55.920763", + "step": 227, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013455665670335293, + "timestamp": "2025-09-04 03:46:55.940359", + "step": 228, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:46:56.016328", + "step": 228, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05484043434262276, + "timestamp": "2025-09-04 03:46:56.031474", + "step": 229, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1200 + ], + "flops": 24000145761984.0 + }, + "timestamp": "2025-09-04 03:46:56.210233", + "step": 229, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011813382618129253, + "timestamp": "2025-09-04 03:46:56.242851", + "step": 230, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:46:56.338311", + "step": 230, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04934141784906387, + "timestamp": "2025-09-04 03:46:56.355328", + "step": 231, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:46:56.462670", + "step": 231, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01455477625131607, + "timestamp": "2025-09-04 03:46:56.483049", + "step": 232, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:46:56.589711", + "step": 232, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.035196904093027115, + "timestamp": "2025-09-04 03:46:56.611601", + "step": 233, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:46:56.713067", + "step": 233, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.1223868653178215, + "timestamp": "2025-09-04 03:46:56.731468", + "step": 234, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:46:56.836088", + "step": 234, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023488562554121017, + "timestamp": "2025-09-04 03:46:56.855251", + "step": 235, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1376 + ], + "flops": 27520167130496.0 + }, + "timestamp": "2025-09-04 03:46:57.059604", + "step": 235, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.041184525936841965, + "timestamp": "2025-09-04 03:46:57.099406", + "step": 236, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:46:57.192703", + "step": 236, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04467172920703888, + "timestamp": "2025-09-04 03:46:57.211674", + "step": 237, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:46:57.296253", + "step": 237, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.058227408677339554, + "timestamp": "2025-09-04 03:46:57.311130", + "step": 238, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:46:57.414555", + "step": 238, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024533184245228767, + "timestamp": "2025-09-04 03:46:57.433467", + "step": 239, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:46:57.533042", + "step": 239, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03975701332092285, + "timestamp": "2025-09-04 03:46:57.552325", + "step": 240, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:47:05.981740", + "step": 240, + "epoch": 1 + }, + { + "type": "pplx", + "content": 312.9286034141921, + "timestamp": "2025-09-04 03:47:05.983838", + "step": 240, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 240", + "timestamp": "2025-09-04 03:47:06.491752", + "step": 240, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:47:06.593414", + "step": 240, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022641537711024284, + "timestamp": "2025-09-04 03:47:06.614328", + "step": 241, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:47:06.718158", + "step": 241, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026233583688735962, + "timestamp": "2025-09-04 03:47:06.737180", + "step": 242, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 03:47:06.859887", + "step": 242, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030290089547634125, + "timestamp": "2025-09-04 03:47:06.882829", + "step": 243, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:47:06.989864", + "step": 243, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009676921181380749, + "timestamp": "2025-09-04 03:47:07.010467", + "step": 244, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:47:07.118651", + "step": 244, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02796107716858387, + "timestamp": "2025-09-04 03:47:07.141141", + "step": 245, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:47:07.251743", + "step": 245, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06316182762384415, + "timestamp": "2025-09-04 03:47:07.271865", + "step": 246, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:47:07.349164", + "step": 246, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.053157925605773926, + "timestamp": "2025-09-04 03:47:07.362748", + "step": 247, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 960 + ], + "flops": 19200116623104.0 + }, + "timestamp": "2025-09-04 03:47:07.501150", + "step": 247, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.1000499501824379, + "timestamp": "2025-09-04 03:47:07.527516", + "step": 248, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:47:07.603960", + "step": 248, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07537966966629028, + "timestamp": "2025-09-04 03:47:07.618912", + "step": 249, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:47:07.723085", + "step": 249, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04253721982240677, + "timestamp": "2025-09-04 03:47:07.742175", + "step": 250, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:47:07.843434", + "step": 250, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026042405515909195, + "timestamp": "2025-09-04 03:47:07.862102", + "step": 251, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:47:07.971729", + "step": 251, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0640282928943634, + "timestamp": "2025-09-04 03:47:07.992076", + "step": 252, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:47:08.094410", + "step": 252, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013043270446360111, + "timestamp": "2025-09-04 03:47:08.115388", + "step": 253, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:47:08.219527", + "step": 253, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02681521698832512, + "timestamp": "2025-09-04 03:47:08.238620", + "step": 254, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1168 + ], + "flops": 23360141876800.0 + }, + "timestamp": "2025-09-04 03:47:08.416005", + "step": 254, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02740609645843506, + "timestamp": "2025-09-04 03:47:08.448567", + "step": 255, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:47:08.536920", + "step": 255, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06523073464632034, + "timestamp": "2025-09-04 03:47:08.553113", + "step": 256, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:47:08.645862", + "step": 256, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019541790708899498, + "timestamp": "2025-09-04 03:47:08.664680", + "step": 257, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:47:08.758789", + "step": 257, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02935168892145157, + "timestamp": "2025-09-04 03:47:08.775713", + "step": 258, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:47:08.870347", + "step": 258, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04195958375930786, + "timestamp": "2025-09-04 03:47:08.887314", + "step": 259, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:47:08.989716", + "step": 259, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013827347196638584, + "timestamp": "2025-09-04 03:47:09.008620", + "step": 260, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:47:17.480249", + "step": 260, + "epoch": 1 + }, + { + "type": "pplx", + "content": 313.3745874189351, + "timestamp": "2025-09-04 03:47:17.482337", + "step": 260, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:47:17.585602", + "step": 260, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.056827034801244736, + "timestamp": "2025-09-04 03:47:17.607847", + "step": 261, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:47:17.707055", + "step": 261, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04780471324920654, + "timestamp": "2025-09-04 03:47:17.725413", + "step": 262, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:47:17.824753", + "step": 262, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024920674040913582, + "timestamp": "2025-09-04 03:47:17.843217", + "step": 263, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:47:17.953478", + "step": 263, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016696227714419365, + "timestamp": "2025-09-04 03:47:17.974500", + "step": 264, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:47:18.065874", + "step": 264, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0172750111669302, + "timestamp": "2025-09-04 03:47:18.084464", + "step": 265, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:47:18.194133", + "step": 265, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0052177682518959045, + "timestamp": "2025-09-04 03:47:18.214495", + "step": 266, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:47:18.298475", + "step": 266, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03147877752780914, + "timestamp": "2025-09-04 03:47:18.313335", + "step": 267, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:47:18.416050", + "step": 267, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03740302100777626, + "timestamp": "2025-09-04 03:47:18.435957", + "step": 268, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1376 + ], + "flops": 27520167130496.0 + }, + "timestamp": "2025-09-04 03:47:18.634340", + "step": 268, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08886415511369705, + "timestamp": "2025-09-04 03:47:18.677352", + "step": 269, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 03:47:18.795835", + "step": 269, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024287065491080284, + "timestamp": "2025-09-04 03:47:18.818027", + "step": 270, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:47:18.913367", + "step": 270, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05668797716498375, + "timestamp": "2025-09-04 03:47:18.930814", + "step": 271, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:47:19.039604", + "step": 271, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05908737704157829, + "timestamp": "2025-09-04 03:47:19.060419", + "step": 272, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:47:19.164051", + "step": 272, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010910317301750183, + "timestamp": "2025-09-04 03:47:19.185981", + "step": 273, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 03:47:19.325988", + "step": 273, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019638676196336746, + "timestamp": "2025-09-04 03:47:19.351679", + "step": 274, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:47:19.436571", + "step": 274, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07036858052015305, + "timestamp": "2025-09-04 03:47:19.451806", + "step": 275, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:47:19.546137", + "step": 275, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026476135477423668, + "timestamp": "2025-09-04 03:47:19.563442", + "step": 276, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:47:19.661217", + "step": 276, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0284750796854496, + "timestamp": "2025-09-04 03:47:19.681378", + "step": 277, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:47:19.765056", + "step": 277, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.10833613574504852, + "timestamp": "2025-09-04 03:47:19.780198", + "step": 278, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:47:19.872507", + "step": 278, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030009621754288673, + "timestamp": "2025-09-04 03:47:19.889386", + "step": 279, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 03:47:19.962887", + "step": 279, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05326032638549805, + "timestamp": "2025-09-04 03:47:19.976593", + "step": 280, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:47:28.367516", + "step": 280, + "epoch": 1 + }, + { + "type": "pplx", + "content": 312.74428797361605, + "timestamp": "2025-09-04 03:47:28.370395", + "step": 280, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 280", + "timestamp": "2025-09-04 03:47:28.839161", + "step": 280, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:47:28.912750", + "step": 280, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03643646836280823, + "timestamp": "2025-09-04 03:47:28.927377", + "step": 281, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:47:29.006960", + "step": 281, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04280658811330795, + "timestamp": "2025-09-04 03:47:29.020511", + "step": 282, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:47:29.117476", + "step": 282, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01976653002202511, + "timestamp": "2025-09-04 03:47:29.134794", + "step": 283, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:47:29.230289", + "step": 283, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0358404815196991, + "timestamp": "2025-09-04 03:47:29.247810", + "step": 284, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:47:29.346237", + "step": 284, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029284300282597542, + "timestamp": "2025-09-04 03:47:29.366289", + "step": 285, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 03:47:29.485868", + "step": 285, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.1099967360496521, + "timestamp": "2025-09-04 03:47:29.507498", + "step": 286, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:47:29.604126", + "step": 286, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023914791643619537, + "timestamp": "2025-09-04 03:47:29.621110", + "step": 287, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 960 + ], + "flops": 19200116623104.0 + }, + "timestamp": "2025-09-04 03:47:29.760597", + "step": 287, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05426377058029175, + "timestamp": "2025-09-04 03:47:29.787210", + "step": 288, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:47:29.879326", + "step": 288, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.050082337111234665, + "timestamp": "2025-09-04 03:47:29.897720", + "step": 289, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:47:29.994274", + "step": 289, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05403923988342285, + "timestamp": "2025-09-04 03:47:30.011501", + "step": 290, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:47:30.111748", + "step": 290, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.047084759920835495, + "timestamp": "2025-09-04 03:47:30.130634", + "step": 291, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:47:30.210351", + "step": 291, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07940501719713211, + "timestamp": "2025-09-04 03:47:30.224933", + "step": 292, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:47:30.323714", + "step": 292, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.09987480938434601, + "timestamp": "2025-09-04 03:47:30.344265", + "step": 293, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 432 + ], + "flops": 8640052517568.0 + }, + "timestamp": "2025-09-04 03:47:30.419688", + "step": 293, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0861741453409195, + "timestamp": "2025-09-04 03:47:30.432119", + "step": 294, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:47:30.541849", + "step": 294, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03038984164595604, + "timestamp": "2025-09-04 03:47:30.561788", + "step": 295, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:47:30.663178", + "step": 295, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017293287441134453, + "timestamp": "2025-09-04 03:47:30.682096", + "step": 296, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:47:30.766176", + "step": 296, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0584249384701252, + "timestamp": "2025-09-04 03:47:30.782879", + "step": 297, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:47:30.878075", + "step": 297, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.053005725145339966, + "timestamp": "2025-09-04 03:47:30.895153", + "step": 298, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:47:30.996825", + "step": 298, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015453525818884373, + "timestamp": "2025-09-04 03:47:31.015301", + "step": 299, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:47:31.125136", + "step": 299, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013833635486662388, + "timestamp": "2025-09-04 03:47:31.145828", + "step": 300, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:47:39.537720", + "step": 300, + "epoch": 1 + }, + { + "type": "pplx", + "content": 311.2226619091774, + "timestamp": "2025-09-04 03:47:39.539677", + "step": 300, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:47:39.635130", + "step": 300, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.005779569037258625, + "timestamp": "2025-09-04 03:47:39.655327", + "step": 301, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:47:39.731923", + "step": 301, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.11063537001609802, + "timestamp": "2025-09-04 03:47:39.745638", + "step": 302, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:47:39.839960", + "step": 302, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02686300128698349, + "timestamp": "2025-09-04 03:47:39.857135", + "step": 303, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:47:39.963489", + "step": 303, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06041613593697548, + "timestamp": "2025-09-04 03:47:39.983975", + "step": 304, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:47:40.083742", + "step": 304, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01706847734749317, + "timestamp": "2025-09-04 03:47:40.104803", + "step": 305, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:47:40.210413", + "step": 305, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011196613311767578, + "timestamp": "2025-09-04 03:47:40.230267", + "step": 306, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:47:40.320630", + "step": 306, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07578642666339874, + "timestamp": "2025-09-04 03:47:40.337139", + "step": 307, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:47:40.430408", + "step": 307, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04545224457979202, + "timestamp": "2025-09-04 03:47:40.448605", + "step": 308, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:47:40.549272", + "step": 308, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.041280921548604965, + "timestamp": "2025-09-04 03:47:40.570265", + "step": 309, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 03:47:40.668868", + "step": 309, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05615556985139847, + "timestamp": "2025-09-04 03:47:40.682155", + "step": 310, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:47:40.785194", + "step": 310, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07136084884405136, + "timestamp": "2025-09-04 03:47:40.804167", + "step": 311, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:47:40.914245", + "step": 311, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019209880381822586, + "timestamp": "2025-09-04 03:47:40.933587", + "step": 312, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:47:41.019153", + "step": 312, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.10823129117488861, + "timestamp": "2025-09-04 03:47:41.036119", + "step": 313, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:47:41.135194", + "step": 313, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03277946263551712, + "timestamp": "2025-09-04 03:47:41.153529", + "step": 314, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:47:41.237996", + "step": 314, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06067274883389473, + "timestamp": "2025-09-04 03:47:41.253446", + "step": 315, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 816 + ], + "flops": 16320099139776.0 + }, + "timestamp": "2025-09-04 03:47:41.381637", + "step": 315, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01747446320950985, + "timestamp": "2025-09-04 03:47:41.405328", + "step": 316, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:47:41.496751", + "step": 316, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024807121604681015, + "timestamp": "2025-09-04 03:47:41.515635", + "step": 317, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:47:41.620768", + "step": 317, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008701799437403679, + "timestamp": "2025-09-04 03:47:41.640483", + "step": 318, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:47:41.746608", + "step": 318, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03150641545653343, + "timestamp": "2025-09-04 03:47:41.766485", + "step": 319, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 03:47:41.902464", + "step": 319, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04005073383450508, + "timestamp": "2025-09-04 03:47:41.929101", + "step": 320, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:47:50.311558", + "step": 320, + "epoch": 1 + }, + { + "type": "pplx", + "content": 309.1398029399646, + "timestamp": "2025-09-04 03:47:50.313320", + "step": 320, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 320", + "timestamp": "2025-09-04 03:47:50.768107", + "step": 320, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 03:47:50.886087", + "step": 320, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024150997400283813, + "timestamp": "2025-09-04 03:47:50.911098", + "step": 321, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:47:51.005388", + "step": 321, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013538923114538193, + "timestamp": "2025-09-04 03:47:51.022833", + "step": 322, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 03:47:51.158175", + "step": 322, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03433951735496521, + "timestamp": "2025-09-04 03:47:51.183772", + "step": 323, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:47:51.282848", + "step": 323, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04349237680435181, + "timestamp": "2025-09-04 03:47:51.301876", + "step": 324, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:47:51.408869", + "step": 324, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03356247767806053, + "timestamp": "2025-09-04 03:47:51.431173", + "step": 325, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:47:51.534373", + "step": 325, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04008670523762703, + "timestamp": "2025-09-04 03:47:51.553175", + "step": 326, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:47:51.629153", + "step": 326, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007040772121399641, + "timestamp": "2025-09-04 03:47:51.642842", + "step": 327, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:47:51.735567", + "step": 327, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03876568377017975, + "timestamp": "2025-09-04 03:47:51.753448", + "step": 328, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:47:51.844898", + "step": 328, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03228950873017311, + "timestamp": "2025-09-04 03:47:51.863955", + "step": 329, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:47:51.958554", + "step": 329, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.09584397077560425, + "timestamp": "2025-09-04 03:47:51.975592", + "step": 330, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:47:52.076487", + "step": 330, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05004340410232544, + "timestamp": "2025-09-04 03:47:52.095272", + "step": 331, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:47:52.173583", + "step": 331, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03638507053256035, + "timestamp": "2025-09-04 03:47:52.188303", + "step": 332, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:47:52.280377", + "step": 332, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05368280038237572, + "timestamp": "2025-09-04 03:47:52.299280", + "step": 333, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:47:52.376428", + "step": 333, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05048099532723427, + "timestamp": "2025-09-04 03:47:52.390448", + "step": 334, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:47:52.494290", + "step": 334, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0237380713224411, + "timestamp": "2025-09-04 03:47:52.513321", + "step": 335, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:47:52.616448", + "step": 335, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014010374434292316, + "timestamp": "2025-09-04 03:47:52.636162", + "step": 336, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:47:52.724698", + "step": 336, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.052086420357227325, + "timestamp": "2025-09-04 03:47:52.743063", + "step": 337, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:47:52.854009", + "step": 337, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04896777868270874, + "timestamp": "2025-09-04 03:47:52.874728", + "step": 338, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:47:52.974955", + "step": 338, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06648825109004974, + "timestamp": "2025-09-04 03:47:52.993821", + "step": 339, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:47:53.081148", + "step": 339, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.09628743678331375, + "timestamp": "2025-09-04 03:47:53.097474", + "step": 340, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:48:02.050472", + "step": 340, + "epoch": 1 + }, + { + "type": "pplx", + "content": 307.83904285220194, + "timestamp": "2025-09-04 03:48:02.053159", + "step": 340, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:48:02.127903", + "step": 340, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027161644771695137, + "timestamp": "2025-09-04 03:48:02.142965", + "step": 341, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 03:48:02.218754", + "step": 341, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0412270650267601, + "timestamp": "2025-09-04 03:48:02.232083", + "step": 342, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:48:02.339531", + "step": 342, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.058129601180553436, + "timestamp": "2025-09-04 03:48:02.359457", + "step": 343, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:48:02.470920", + "step": 343, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04298894852399826, + "timestamp": "2025-09-04 03:48:02.492120", + "step": 344, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:48:02.591173", + "step": 344, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04139872267842293, + "timestamp": "2025-09-04 03:48:02.611744", + "step": 345, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 03:48:02.748595", + "step": 345, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.036456745117902756, + "timestamp": "2025-09-04 03:48:02.774427", + "step": 346, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:48:02.879028", + "step": 346, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.09559651464223862, + "timestamp": "2025-09-04 03:48:02.898191", + "step": 347, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 03:48:02.971130", + "step": 347, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.036566995084285736, + "timestamp": "2025-09-04 03:48:02.984752", + "step": 348, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:48:03.082772", + "step": 348, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04065980389714241, + "timestamp": "2025-09-04 03:48:03.103069", + "step": 349, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:48:03.208487", + "step": 349, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05061135068535805, + "timestamp": "2025-09-04 03:48:03.227437", + "step": 350, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:48:03.303490", + "step": 350, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016793936491012573, + "timestamp": "2025-09-04 03:48:03.317128", + "step": 351, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:48:03.401892", + "step": 351, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02601003088057041, + "timestamp": "2025-09-04 03:48:03.417643", + "step": 352, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:48:03.519469", + "step": 352, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02657604031264782, + "timestamp": "2025-09-04 03:48:03.540081", + "step": 353, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:48:03.627443", + "step": 353, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02620949223637581, + "timestamp": "2025-09-04 03:48:03.643009", + "step": 354, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:48:03.729847", + "step": 354, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05669962242245674, + "timestamp": "2025-09-04 03:48:03.745335", + "step": 355, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:48:03.847153", + "step": 355, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.028145290911197662, + "timestamp": "2025-09-04 03:48:03.866641", + "step": 356, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 03:48:03.986575", + "step": 356, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0207599475979805, + "timestamp": "2025-09-04 03:48:04.011984", + "step": 357, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:48:04.091711", + "step": 357, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030706727877259254, + "timestamp": "2025-09-04 03:48:04.105863", + "step": 358, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:48:04.212114", + "step": 358, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010737722739577293, + "timestamp": "2025-09-04 03:48:04.232012", + "step": 359, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 848 + ], + "flops": 16960103024960.0 + }, + "timestamp": "2025-09-04 03:48:04.362847", + "step": 359, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029181912541389465, + "timestamp": "2025-09-04 03:48:04.387540", + "step": 360, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:48:12.919636", + "step": 360, + "epoch": 1 + }, + { + "type": "pplx", + "content": 307.5586884862369, + "timestamp": "2025-09-04 03:48:12.935018", + "step": 360, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 360", + "timestamp": "2025-09-04 03:48:13.409472", + "step": 360, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 03:48:13.515183", + "step": 360, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03834038972854614, + "timestamp": "2025-09-04 03:48:13.529742", + "step": 361, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:48:13.687701", + "step": 361, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025468185544013977, + "timestamp": "2025-09-04 03:48:13.706710", + "step": 362, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:48:13.836524", + "step": 362, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0641966164112091, + "timestamp": "2025-09-04 03:48:13.855536", + "step": 363, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:48:13.996064", + "step": 363, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03254721313714981, + "timestamp": "2025-09-04 03:48:14.017115", + "step": 364, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:48:14.164508", + "step": 364, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.14557507634162903, + "timestamp": "2025-09-04 03:48:14.186890", + "step": 365, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:48:14.293921", + "step": 365, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03721340000629425, + "timestamp": "2025-09-04 03:48:14.311196", + "step": 366, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:48:14.437001", + "step": 366, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011114726774394512, + "timestamp": "2025-09-04 03:48:14.457608", + "step": 367, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:48:14.600803", + "step": 367, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0390244759619236, + "timestamp": "2025-09-04 03:48:14.618979", + "step": 368, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:48:14.746624", + "step": 368, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.09442558139562607, + "timestamp": "2025-09-04 03:48:14.767824", + "step": 369, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:48:14.883101", + "step": 369, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07992935180664062, + "timestamp": "2025-09-04 03:48:14.901810", + "step": 370, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:48:15.024280", + "step": 370, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05484499782323837, + "timestamp": "2025-09-04 03:48:15.042914", + "step": 371, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:48:15.147914", + "step": 371, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06619900465011597, + "timestamp": "2025-09-04 03:48:15.164438", + "step": 372, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 03:48:15.250873", + "step": 372, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03658528998494148, + "timestamp": "2025-09-04 03:48:15.303114", + "step": 373, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:48:15.433580", + "step": 373, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022451380267739296, + "timestamp": "2025-09-04 03:48:15.452866", + "step": 374, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 864 + ], + "flops": 17280104967552.0 + }, + "timestamp": "2025-09-04 03:48:15.615676", + "step": 374, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05221700668334961, + "timestamp": "2025-09-04 03:48:15.639715", + "step": 375, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1040 + ], + "flops": 20800126336064.0 + }, + "timestamp": "2025-09-04 03:48:15.819023", + "step": 375, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.09046898037195206, + "timestamp": "2025-09-04 03:48:15.849378", + "step": 376, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:48:15.998775", + "step": 376, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07325262576341629, + "timestamp": "2025-09-04 03:48:16.023460", + "step": 377, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:48:16.119471", + "step": 377, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.042737096548080444, + "timestamp": "2025-09-04 03:48:16.133559", + "step": 378, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:48:16.283766", + "step": 378, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02026101015508175, + "timestamp": "2025-09-04 03:48:16.304404", + "step": 379, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:48:16.427741", + "step": 379, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011748573742806911, + "timestamp": "2025-09-04 03:48:16.445240", + "step": 380, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:48:25.287729", + "step": 380, + "epoch": 1 + }, + { + "type": "pplx", + "content": 311.8221380685682, + "timestamp": "2025-09-04 03:48:25.295794", + "step": 380, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:48:25.392305", + "step": 380, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013346564956009388, + "timestamp": "2025-09-04 03:48:25.410543", + "step": 381, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:48:25.535167", + "step": 381, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013797925785183907, + "timestamp": "2025-09-04 03:48:25.555859", + "step": 382, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:48:25.681692", + "step": 382, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015700260177254677, + "timestamp": "2025-09-04 03:48:25.701141", + "step": 383, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:48:25.835149", + "step": 383, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024386491626501083, + "timestamp": "2025-09-04 03:48:25.859596", + "step": 384, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:48:25.987712", + "step": 384, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016429277136921883, + "timestamp": "2025-09-04 03:48:26.006472", + "step": 385, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:48:26.122316", + "step": 385, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011799799278378487, + "timestamp": "2025-09-04 03:48:26.141876", + "step": 386, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:48:26.248691", + "step": 386, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06375425308942795, + "timestamp": "2025-09-04 03:48:26.265309", + "step": 387, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:48:26.346698", + "step": 387, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.035253461450338364, + "timestamp": "2025-09-04 03:48:26.361086", + "step": 388, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:48:26.473191", + "step": 388, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05822211131453514, + "timestamp": "2025-09-04 03:48:26.494275", + "step": 389, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:48:26.575019", + "step": 389, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.035860493779182434, + "timestamp": "2025-09-04 03:48:26.588671", + "step": 390, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:48:26.674728", + "step": 390, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025279760360717773, + "timestamp": "2025-09-04 03:48:26.688319", + "step": 391, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:48:26.776869", + "step": 391, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022274592891335487, + "timestamp": "2025-09-04 03:48:26.792909", + "step": 392, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:48:26.892516", + "step": 392, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026588624343276024, + "timestamp": "2025-09-04 03:48:26.912495", + "step": 393, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:48:27.007916", + "step": 393, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0037299662362784147, + "timestamp": "2025-09-04 03:48:27.024992", + "step": 394, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:48:27.130031", + "step": 394, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016043562442064285, + "timestamp": "2025-09-04 03:48:27.149089", + "step": 395, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:48:27.261367", + "step": 395, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022281493991613388, + "timestamp": "2025-09-04 03:48:27.282593", + "step": 396, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:48:27.381479", + "step": 396, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019878646358847618, + "timestamp": "2025-09-04 03:48:27.401844", + "step": 397, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:48:27.493393", + "step": 397, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06282302737236023, + "timestamp": "2025-09-04 03:48:27.510053", + "step": 398, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:48:27.627066", + "step": 398, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016571981832385063, + "timestamp": "2025-09-04 03:48:27.646198", + "step": 399, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:48:27.748235", + "step": 399, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05791422352194786, + "timestamp": "2025-09-04 03:48:27.767514", + "step": 400, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:48:36.321805", + "step": 400, + "epoch": 1 + }, + { + "type": "pplx", + "content": 316.7635327191992, + "timestamp": "2025-09-04 03:48:36.324054", + "step": 400, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 400", + "timestamp": "2025-09-04 03:48:36.675112", + "step": 400, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:48:36.774708", + "step": 400, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022503888234496117, + "timestamp": "2025-09-04 03:48:36.795589", + "step": 401, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:48:36.889466", + "step": 401, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021656574681401253, + "timestamp": "2025-09-04 03:48:36.906649", + "step": 402, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:48:36.992132", + "step": 402, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.036612629890441895, + "timestamp": "2025-09-04 03:48:37.007536", + "step": 403, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 03:48:37.081308", + "step": 403, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015442884527146816, + "timestamp": "2025-09-04 03:48:37.094723", + "step": 404, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:48:37.185279", + "step": 404, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02640901878476143, + "timestamp": "2025-09-04 03:48:37.203874", + "step": 405, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:48:37.290602", + "step": 405, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05808902904391289, + "timestamp": "2025-09-04 03:48:37.305715", + "step": 406, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:48:37.401155", + "step": 406, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02911917120218277, + "timestamp": "2025-09-04 03:48:37.418222", + "step": 407, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 03:48:37.494702", + "step": 407, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012547432444989681, + "timestamp": "2025-09-04 03:48:37.508723", + "step": 408, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:48:37.615037", + "step": 408, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029442116618156433, + "timestamp": "2025-09-04 03:48:37.637295", + "step": 409, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:48:37.743694", + "step": 409, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017742721363902092, + "timestamp": "2025-09-04 03:48:37.763621", + "step": 410, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:48:37.854723", + "step": 410, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.13883210718631744, + "timestamp": "2025-09-04 03:48:37.871242", + "step": 411, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:48:37.964918", + "step": 411, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06384449452161789, + "timestamp": "2025-09-04 03:48:37.982828", + "step": 412, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 03:48:38.096961", + "step": 412, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021444780752062798, + "timestamp": "2025-09-04 03:48:38.120984", + "step": 413, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:48:38.213825", + "step": 413, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023383496329188347, + "timestamp": "2025-09-04 03:48:38.230925", + "step": 414, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:48:38.334573", + "step": 414, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011450543999671936, + "timestamp": "2025-09-04 03:48:38.353477", + "step": 415, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:48:38.430473", + "step": 415, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03750946745276451, + "timestamp": "2025-09-04 03:48:38.445210", + "step": 416, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:48:38.543176", + "step": 416, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011969326063990593, + "timestamp": "2025-09-04 03:48:38.563556", + "step": 417, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:48:38.639745", + "step": 417, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07576386630535126, + "timestamp": "2025-09-04 03:48:38.653215", + "step": 418, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 03:48:38.787749", + "step": 418, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009221615269780159, + "timestamp": "2025-09-04 03:48:38.813311", + "step": 419, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:48:38.913752", + "step": 419, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020009953528642654, + "timestamp": "2025-09-04 03:48:38.932761", + "step": 420, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:48:47.342665", + "step": 420, + "epoch": 1 + }, + { + "type": "pplx", + "content": 319.4097698572594, + "timestamp": "2025-09-04 03:48:47.344661", + "step": 420, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:48:47.449253", + "step": 420, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.053698133677244186, + "timestamp": "2025-09-04 03:48:47.471467", + "step": 421, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:48:47.581639", + "step": 421, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009336840361356735, + "timestamp": "2025-09-04 03:48:47.602046", + "step": 422, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 800 + ], + "flops": 16000097197184.0 + }, + "timestamp": "2025-09-04 03:48:47.722268", + "step": 422, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017453299835324287, + "timestamp": "2025-09-04 03:48:47.744067", + "step": 423, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:48:47.827080", + "step": 423, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03400561586022377, + "timestamp": "2025-09-04 03:48:47.842904", + "step": 424, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:48:47.936119", + "step": 424, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012152721174061298, + "timestamp": "2025-09-04 03:48:47.955176", + "step": 425, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:48:48.055267", + "step": 425, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.052000127732753754, + "timestamp": "2025-09-04 03:48:48.073372", + "step": 426, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:48:48.167311", + "step": 426, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01812596619129181, + "timestamp": "2025-09-04 03:48:48.184092", + "step": 427, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:48:48.295139", + "step": 427, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.038504090160131454, + "timestamp": "2025-09-04 03:48:48.316458", + "step": 428, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:48:48.416934", + "step": 428, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07306548207998276, + "timestamp": "2025-09-04 03:48:48.438023", + "step": 429, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:48:48.533780", + "step": 429, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08940979093313217, + "timestamp": "2025-09-04 03:48:48.551198", + "step": 430, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:48:48.647281", + "step": 430, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01325355563312769, + "timestamp": "2025-09-04 03:48:48.664338", + "step": 431, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:48:48.759489", + "step": 431, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.046491898596286774, + "timestamp": "2025-09-04 03:48:48.777517", + "step": 432, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:48:48.883612", + "step": 432, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.12319537252187729, + "timestamp": "2025-09-04 03:48:48.905497", + "step": 433, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:48:49.016164", + "step": 433, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0106568094342947, + "timestamp": "2025-09-04 03:48:49.036142", + "step": 434, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:48:49.136703", + "step": 434, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009872270748019218, + "timestamp": "2025-09-04 03:48:49.155241", + "step": 435, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:48:49.256981", + "step": 435, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016583112999796867, + "timestamp": "2025-09-04 03:48:49.275694", + "step": 436, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:48:49.360494", + "step": 436, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008935445919632912, + "timestamp": "2025-09-04 03:48:49.376832", + "step": 437, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:48:49.468735", + "step": 437, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03650280088186264, + "timestamp": "2025-09-04 03:48:49.485004", + "step": 438, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:48:49.581069", + "step": 438, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017082802951335907, + "timestamp": "2025-09-04 03:48:49.598116", + "step": 439, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:48:49.702688", + "step": 439, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007683966308832169, + "timestamp": "2025-09-04 03:48:49.722439", + "step": 440, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:48:58.199774", + "step": 440, + "epoch": 1 + }, + { + "type": "pplx", + "content": 321.40916814073927, + "timestamp": "2025-09-04 03:48:58.202230", + "step": 440, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 440", + "timestamp": "2025-09-04 03:48:58.565856", + "step": 440, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:48:58.664276", + "step": 440, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023604106158018112, + "timestamp": "2025-09-04 03:48:58.684199", + "step": 441, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1488 + ], + "flops": 29760180728640.0 + }, + "timestamp": "2025-09-04 03:48:58.904564", + "step": 441, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012493900023400784, + "timestamp": "2025-09-04 03:48:58.946767", + "step": 442, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:48:59.054947", + "step": 442, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027849415317177773, + "timestamp": "2025-09-04 03:48:59.074061", + "step": 443, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:48:59.168688", + "step": 443, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027136214077472687, + "timestamp": "2025-09-04 03:48:59.186008", + "step": 444, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 03:48:59.260823", + "step": 444, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06847383081912994, + "timestamp": "2025-09-04 03:48:59.275370", + "step": 445, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:48:59.387276", + "step": 445, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006238785106688738, + "timestamp": "2025-09-04 03:48:59.407172", + "step": 446, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:48:59.495549", + "step": 446, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.060996416956186295, + "timestamp": "2025-09-04 03:48:59.510441", + "step": 447, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:48:59.602462", + "step": 447, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04064783453941345, + "timestamp": "2025-09-04 03:48:59.619374", + "step": 448, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:48:59.724508", + "step": 448, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08736442029476166, + "timestamp": "2025-09-04 03:48:59.745799", + "step": 449, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:48:59.850860", + "step": 449, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0009415106615051627, + "timestamp": "2025-09-04 03:48:59.869303", + "step": 450, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:48:59.964786", + "step": 450, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.043740540742874146, + "timestamp": "2025-09-04 03:48:59.981465", + "step": 451, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:49:00.076418", + "step": 451, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015802182257175446, + "timestamp": "2025-09-04 03:49:00.093653", + "step": 452, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:49:00.195759", + "step": 452, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07930180430412292, + "timestamp": "2025-09-04 03:49:00.216444", + "step": 453, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:49:00.312721", + "step": 453, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01712995208799839, + "timestamp": "2025-09-04 03:49:00.329671", + "step": 454, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:49:00.439710", + "step": 454, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06034353747963905, + "timestamp": "2025-09-04 03:49:00.460043", + "step": 455, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:49:00.552284", + "step": 455, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008295533247292042, + "timestamp": "2025-09-04 03:49:00.569255", + "step": 456, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:49:00.653046", + "step": 456, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019054660573601723, + "timestamp": "2025-09-04 03:49:00.669066", + "step": 457, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:49:00.767740", + "step": 457, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01991303637623787, + "timestamp": "2025-09-04 03:49:00.784981", + "step": 458, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:49:00.886589", + "step": 458, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.033343605697155, + "timestamp": "2025-09-04 03:49:00.904779", + "step": 459, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:49:01.005430", + "step": 459, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07760113477706909, + "timestamp": "2025-09-04 03:49:01.024576", + "step": 460, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:49:09.462043", + "step": 460, + "epoch": 1 + }, + { + "type": "pplx", + "content": 324.06946329758586, + "timestamp": "2025-09-04 03:49:09.463792", + "step": 460, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:49:09.541744", + "step": 460, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04705421254038811, + "timestamp": "2025-09-04 03:49:09.558323", + "step": 461, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:49:09.663299", + "step": 461, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.004760258831083775, + "timestamp": "2025-09-04 03:49:09.683378", + "step": 462, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:49:09.775625", + "step": 462, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02817857638001442, + "timestamp": "2025-09-04 03:49:09.792725", + "step": 463, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:49:09.895742", + "step": 463, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04996614158153534, + "timestamp": "2025-09-04 03:49:09.915742", + "step": 464, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:49:10.013902", + "step": 464, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03526470810174942, + "timestamp": "2025-09-04 03:49:10.034308", + "step": 465, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:49:10.118179", + "step": 465, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.061329569667577744, + "timestamp": "2025-09-04 03:49:10.133268", + "step": 466, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1376 + ], + "flops": 27520167130496.0 + }, + "timestamp": "2025-09-04 03:49:10.335498", + "step": 466, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013789204880595207, + "timestamp": "2025-09-04 03:49:10.374743", + "step": 467, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:49:10.485354", + "step": 467, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.049680087715387344, + "timestamp": "2025-09-04 03:49:10.506771", + "step": 468, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:49:10.581897", + "step": 468, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05645358934998512, + "timestamp": "2025-09-04 03:49:10.597034", + "step": 469, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:49:10.706775", + "step": 469, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01656719297170639, + "timestamp": "2025-09-04 03:49:10.727119", + "step": 470, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:49:10.805413", + "step": 470, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01675380952656269, + "timestamp": "2025-09-04 03:49:10.819234", + "step": 471, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 03:49:10.891804", + "step": 471, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.005480926018208265, + "timestamp": "2025-09-04 03:49:10.905273", + "step": 472, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 03:49:11.023420", + "step": 472, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.037396758794784546, + "timestamp": "2025-09-04 03:49:11.048758", + "step": 473, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:49:11.126774", + "step": 473, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029778504744172096, + "timestamp": "2025-09-04 03:49:11.140580", + "step": 474, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 912 + ], + "flops": 18240110795328.0 + }, + "timestamp": "2025-09-04 03:49:11.277106", + "step": 474, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0628209188580513, + "timestamp": "2025-09-04 03:49:11.301517", + "step": 475, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:49:11.400955", + "step": 475, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02091328613460064, + "timestamp": "2025-09-04 03:49:11.420085", + "step": 476, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:49:11.510953", + "step": 476, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.060857050120830536, + "timestamp": "2025-09-04 03:49:11.529984", + "step": 477, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:49:11.623012", + "step": 477, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03812684863805771, + "timestamp": "2025-09-04 03:49:11.639932", + "step": 478, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:49:11.725628", + "step": 478, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.09244571626186371, + "timestamp": "2025-09-04 03:49:11.741008", + "step": 479, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:49:11.833497", + "step": 479, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05786127969622612, + "timestamp": "2025-09-04 03:49:11.851190", + "step": 480, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:49:20.202978", + "step": 480, + "epoch": 1 + }, + { + "type": "pplx", + "content": 328.5239349757793, + "timestamp": "2025-09-04 03:49:20.204938", + "step": 480, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 480", + "timestamp": "2025-09-04 03:49:20.553378", + "step": 480, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:49:20.657173", + "step": 480, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.047159433364868164, + "timestamp": "2025-09-04 03:49:20.679232", + "step": 481, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:49:20.773246", + "step": 481, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07475479692220688, + "timestamp": "2025-09-04 03:49:20.790699", + "step": 482, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:49:20.893431", + "step": 482, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012311739847064018, + "timestamp": "2025-09-04 03:49:20.912687", + "step": 483, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:49:21.022565", + "step": 483, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011123532429337502, + "timestamp": "2025-09-04 03:49:21.043679", + "step": 484, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:49:21.148538", + "step": 484, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06513547897338867, + "timestamp": "2025-09-04 03:49:21.170605", + "step": 485, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:49:21.276579", + "step": 485, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019697437062859535, + "timestamp": "2025-09-04 03:49:21.296331", + "step": 486, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:49:21.403122", + "step": 486, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06598882377147675, + "timestamp": "2025-09-04 03:49:21.422840", + "step": 487, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:49:21.529193", + "step": 487, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0512818843126297, + "timestamp": "2025-09-04 03:49:21.549823", + "step": 488, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:49:21.624011", + "step": 488, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021413128823041916, + "timestamp": "2025-09-04 03:49:21.638832", + "step": 489, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:49:21.723109", + "step": 489, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018553882837295532, + "timestamp": "2025-09-04 03:49:21.738095", + "step": 490, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:49:21.839249", + "step": 490, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.11178267747163773, + "timestamp": "2025-09-04 03:49:21.856511", + "step": 491, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:49:21.933185", + "step": 491, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07693499326705933, + "timestamp": "2025-09-04 03:49:21.947886", + "step": 492, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:49:22.035794", + "step": 492, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03976715728640556, + "timestamp": "2025-09-04 03:49:22.053973", + "step": 493, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:49:22.137152", + "step": 493, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025103233754634857, + "timestamp": "2025-09-04 03:49:22.152121", + "step": 494, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:49:22.259839", + "step": 494, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011973893269896507, + "timestamp": "2025-09-04 03:49:22.279867", + "step": 495, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:49:22.378194", + "step": 495, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08681105077266693, + "timestamp": "2025-09-04 03:49:22.397335", + "step": 496, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:49:22.471677", + "step": 496, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0133007001131773, + "timestamp": "2025-09-04 03:49:22.486802", + "step": 497, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:49:22.562649", + "step": 497, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05417775362730026, + "timestamp": "2025-09-04 03:49:22.576169", + "step": 498, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:49:22.652206", + "step": 498, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04143834114074707, + "timestamp": "2025-09-04 03:49:22.665755", + "step": 499, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:49:22.749786", + "step": 499, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021494613960385323, + "timestamp": "2025-09-04 03:49:22.765498", + "step": 500, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:49:31.153962", + "step": 500, + "epoch": 1 + }, + { + "type": "pplx", + "content": 332.01159701525927, + "timestamp": "2025-09-04 03:49:31.156242", + "step": 500, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 800 + ], + "flops": 16000097197184.0 + }, + "timestamp": "2025-09-04 03:49:31.271328", + "step": 500, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.057877492159605026, + "timestamp": "2025-09-04 03:49:31.295176", + "step": 501, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:49:31.398872", + "step": 501, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023877454921603203, + "timestamp": "2025-09-04 03:49:31.418005", + "step": 502, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:49:31.518096", + "step": 502, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017220618203282356, + "timestamp": "2025-09-04 03:49:31.536414", + "step": 503, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:49:31.647385", + "step": 503, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03338051587343216, + "timestamp": "2025-09-04 03:49:31.668588", + "step": 504, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:49:31.760724", + "step": 504, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008282708935439587, + "timestamp": "2025-09-04 03:49:31.779547", + "step": 505, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 03:49:31.854732", + "step": 505, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027649713680148125, + "timestamp": "2025-09-04 03:49:31.868026", + "step": 506, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:49:31.978236", + "step": 506, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07461496442556381, + "timestamp": "2025-09-04 03:49:31.998627", + "step": 507, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:49:32.101885", + "step": 507, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.034678079187870026, + "timestamp": "2025-09-04 03:49:32.121824", + "step": 508, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:49:32.225900", + "step": 508, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017775679007172585, + "timestamp": "2025-09-04 03:49:32.247614", + "step": 509, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 03:49:32.324639", + "step": 509, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021030427888035774, + "timestamp": "2025-09-04 03:49:32.337950", + "step": 510, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:49:32.431041", + "step": 510, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0012499457225203514, + "timestamp": "2025-09-04 03:49:32.448340", + "step": 511, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:49:32.550823", + "step": 511, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05383225530385971, + "timestamp": "2025-09-04 03:49:32.570216", + "step": 512, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:49:32.673612", + "step": 512, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.09943293035030365, + "timestamp": "2025-09-04 03:49:32.695354", + "step": 513, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:49:32.804728", + "step": 513, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.067501500248909, + "timestamp": "2025-09-04 03:49:32.825120", + "step": 514, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 03:49:32.897035", + "step": 514, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.056289635598659515, + "timestamp": "2025-09-04 03:49:32.909856", + "step": 515, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:49:33.004917", + "step": 515, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020486541092395782, + "timestamp": "2025-09-04 03:49:33.022917", + "step": 516, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:49:33.113978", + "step": 516, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.09998507797718048, + "timestamp": "2025-09-04 03:49:33.132916", + "step": 517, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:49:33.234964", + "step": 517, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018785972148180008, + "timestamp": "2025-09-04 03:49:33.254022", + "step": 518, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:49:33.346763", + "step": 518, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05426869913935661, + "timestamp": "2025-09-04 03:49:33.363701", + "step": 519, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:49:33.458619", + "step": 519, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0028427981305867434, + "timestamp": "2025-09-04 03:49:33.476697", + "step": 520, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:49:41.838088", + "step": 520, + "epoch": 1 + }, + { + "type": "pplx", + "content": 331.47364260793984, + "timestamp": "2025-09-04 03:49:41.839937", + "step": 520, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 520", + "timestamp": "2025-09-04 03:49:42.197749", + "step": 520, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:49:42.304263", + "step": 520, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.004516193643212318, + "timestamp": "2025-09-04 03:49:42.326503", + "step": 521, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:49:42.420198", + "step": 521, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029628755524754524, + "timestamp": "2025-09-04 03:49:42.437520", + "step": 522, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:49:42.537022", + "step": 522, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016332386061549187, + "timestamp": "2025-09-04 03:49:42.555319", + "step": 523, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:49:42.657450", + "step": 523, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.12754516303539276, + "timestamp": "2025-09-04 03:49:42.677064", + "step": 524, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:49:42.767235", + "step": 524, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0154288774356246, + "timestamp": "2025-09-04 03:49:42.785875", + "step": 525, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:49:42.888440", + "step": 525, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04728075489401817, + "timestamp": "2025-09-04 03:49:42.907264", + "step": 526, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 384 + ], + "flops": 7680046689792.0 + }, + "timestamp": "2025-09-04 03:49:42.974277", + "step": 526, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.039872948080301285, + "timestamp": "2025-09-04 03:49:42.985085", + "step": 527, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 03:49:43.103378", + "step": 527, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.15819469094276428, + "timestamp": "2025-09-04 03:49:43.126041", + "step": 528, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:49:43.214657", + "step": 528, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00868944637477398, + "timestamp": "2025-09-04 03:49:43.232835", + "step": 529, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:49:43.310728", + "step": 529, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.043173011392354965, + "timestamp": "2025-09-04 03:49:43.324708", + "step": 530, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:49:43.424559", + "step": 530, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009749419055879116, + "timestamp": "2025-09-04 03:49:43.443157", + "step": 531, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:49:43.542929", + "step": 531, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.060370367020368576, + "timestamp": "2025-09-04 03:49:43.562461", + "step": 532, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:49:43.646414", + "step": 532, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05955101177096367, + "timestamp": "2025-09-04 03:49:43.663498", + "step": 533, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:49:43.769394", + "step": 533, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011455871164798737, + "timestamp": "2025-09-04 03:49:43.789213", + "step": 534, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:49:43.883263", + "step": 534, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017603786662220955, + "timestamp": "2025-09-04 03:49:43.900564", + "step": 535, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:49:43.999470", + "step": 535, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.004019308835268021, + "timestamp": "2025-09-04 03:49:44.018774", + "step": 536, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:49:44.108934", + "step": 536, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03117678314447403, + "timestamp": "2025-09-04 03:49:44.127529", + "step": 537, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:49:44.228887", + "step": 537, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02788373827934265, + "timestamp": "2025-09-04 03:49:44.247823", + "step": 538, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:49:44.323710", + "step": 538, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023979298770427704, + "timestamp": "2025-09-04 03:49:44.337235", + "step": 539, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:49:44.427420", + "step": 539, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0515667125582695, + "timestamp": "2025-09-04 03:49:44.444749", + "step": 540, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:49:52.800904", + "step": 540, + "epoch": 1 + }, + { + "type": "pplx", + "content": 328.1148207360572, + "timestamp": "2025-09-04 03:49:52.802715", + "step": 540, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:49:52.884845", + "step": 540, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.034760650247335434, + "timestamp": "2025-09-04 03:49:52.901766", + "step": 541, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 816 + ], + "flops": 16320099139776.0 + }, + "timestamp": "2025-09-04 03:49:53.030246", + "step": 541, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.061195723712444305, + "timestamp": "2025-09-04 03:49:53.053121", + "step": 542, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:49:53.147174", + "step": 542, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04390040412545204, + "timestamp": "2025-09-04 03:49:53.164465", + "step": 543, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:49:53.258655", + "step": 543, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01793227717280388, + "timestamp": "2025-09-04 03:49:53.276613", + "step": 544, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 03:49:53.414366", + "step": 544, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0374876894056797, + "timestamp": "2025-09-04 03:49:53.442685", + "step": 545, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:49:53.543275", + "step": 545, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08235947042703629, + "timestamp": "2025-09-04 03:49:53.561611", + "step": 546, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:49:53.656351", + "step": 546, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04841390252113342, + "timestamp": "2025-09-04 03:49:53.673261", + "step": 547, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:49:53.781485", + "step": 547, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01317357737571001, + "timestamp": "2025-09-04 03:49:53.802283", + "step": 548, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:49:53.893276", + "step": 548, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016209116205573082, + "timestamp": "2025-09-04 03:49:53.911823", + "step": 549, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:49:54.016291", + "step": 549, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08481376618146896, + "timestamp": "2025-09-04 03:49:54.035369", + "step": 550, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:49:54.144944", + "step": 550, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.045819301158189774, + "timestamp": "2025-09-04 03:49:54.165459", + "step": 551, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:49:54.256078", + "step": 551, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02893437258899212, + "timestamp": "2025-09-04 03:49:54.273632", + "step": 552, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 03:49:54.393325", + "step": 552, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0218905508518219, + "timestamp": "2025-09-04 03:49:54.418542", + "step": 553, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:49:54.511444", + "step": 553, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03285536542534828, + "timestamp": "2025-09-04 03:49:54.528330", + "step": 554, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:49:54.630178", + "step": 554, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008444556035101414, + "timestamp": "2025-09-04 03:49:54.649073", + "step": 555, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:49:54.740380", + "step": 555, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04841528460383415, + "timestamp": "2025-09-04 03:49:54.757766", + "step": 556, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:49:54.848997", + "step": 556, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02788812294602394, + "timestamp": "2025-09-04 03:49:54.867556", + "step": 557, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 03:49:54.942951", + "step": 557, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03387841582298279, + "timestamp": "2025-09-04 03:49:54.956225", + "step": 558, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:49:55.056086", + "step": 558, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.038461048156023026, + "timestamp": "2025-09-04 03:49:55.074363", + "step": 559, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:49:55.161963", + "step": 559, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025093531236052513, + "timestamp": "2025-09-04 03:49:55.177890", + "step": 560, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:50:03.607868", + "step": 560, + "epoch": 1 + }, + { + "type": "pplx", + "content": 323.1010053666772, + "timestamp": "2025-09-04 03:50:03.610489", + "step": 560, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 560", + "timestamp": "2025-09-04 03:50:04.111690", + "step": 560, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:50:04.194180", + "step": 560, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02403130754828453, + "timestamp": "2025-09-04 03:50:04.210903", + "step": 561, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:50:04.300277", + "step": 561, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010109679773449898, + "timestamp": "2025-09-04 03:50:04.316739", + "step": 562, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:50:04.418541", + "step": 562, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.053742896765470505, + "timestamp": "2025-09-04 03:50:04.436868", + "step": 563, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:50:04.547209", + "step": 563, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025198856368660927, + "timestamp": "2025-09-04 03:50:04.568436", + "step": 564, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:50:04.655610", + "step": 564, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04810674116015434, + "timestamp": "2025-09-04 03:50:04.673889", + "step": 565, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:50:04.782853", + "step": 565, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01729135774075985, + "timestamp": "2025-09-04 03:50:04.802978", + "step": 566, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:50:04.912535", + "step": 566, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03990844264626503, + "timestamp": "2025-09-04 03:50:04.932830", + "step": 567, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:50:05.016165", + "step": 567, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018527284264564514, + "timestamp": "2025-09-04 03:50:05.031879", + "step": 568, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:50:05.123611", + "step": 568, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04572749882936478, + "timestamp": "2025-09-04 03:50:05.142464", + "step": 569, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:50:05.251316", + "step": 569, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06394918262958527, + "timestamp": "2025-09-04 03:50:05.271592", + "step": 570, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:50:05.365527", + "step": 570, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023581665009260178, + "timestamp": "2025-09-04 03:50:05.382808", + "step": 571, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:50:05.485680", + "step": 571, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026009058579802513, + "timestamp": "2025-09-04 03:50:05.505623", + "step": 572, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:50:05.612207", + "step": 572, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011551128700375557, + "timestamp": "2025-09-04 03:50:05.634936", + "step": 573, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:50:05.737394", + "step": 573, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0341804064810276, + "timestamp": "2025-09-04 03:50:05.756726", + "step": 574, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:50:05.849571", + "step": 574, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.09191818535327911, + "timestamp": "2025-09-04 03:50:05.866319", + "step": 575, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:50:05.966270", + "step": 575, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008478672243654728, + "timestamp": "2025-09-04 03:50:05.985773", + "step": 576, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:50:06.086160", + "step": 576, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.10518831759691238, + "timestamp": "2025-09-04 03:50:06.107230", + "step": 577, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:50:06.205533", + "step": 577, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0350443534553051, + "timestamp": "2025-09-04 03:50:06.223903", + "step": 578, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 03:50:06.295644", + "step": 578, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03610123321413994, + "timestamp": "2025-09-04 03:50:06.308378", + "step": 579, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:50:06.402413", + "step": 579, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07018542289733887, + "timestamp": "2025-09-04 03:50:06.420355", + "step": 580, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:50:14.881907", + "step": 580, + "epoch": 1 + }, + { + "type": "pplx", + "content": 323.03369944740126, + "timestamp": "2025-09-04 03:50:14.884156", + "step": 580, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:50:14.980302", + "step": 580, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009829767979681492, + "timestamp": "2025-09-04 03:50:15.000981", + "step": 581, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 432 + ], + "flops": 8640052517568.0 + }, + "timestamp": "2025-09-04 03:50:15.073314", + "step": 581, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01034059002995491, + "timestamp": "2025-09-04 03:50:15.085969", + "step": 582, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:50:15.182867", + "step": 582, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023535916581749916, + "timestamp": "2025-09-04 03:50:15.200207", + "step": 583, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:50:15.306387", + "step": 583, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.048321448266506195, + "timestamp": "2025-09-04 03:50:15.326896", + "step": 584, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:50:15.426730", + "step": 584, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007726522162556648, + "timestamp": "2025-09-04 03:50:15.447627", + "step": 585, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:50:15.525155", + "step": 585, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.10942022502422333, + "timestamp": "2025-09-04 03:50:15.538987", + "step": 586, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:50:15.645303", + "step": 586, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00408227788284421, + "timestamp": "2025-09-04 03:50:15.665165", + "step": 587, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:50:15.767654", + "step": 587, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013964972458779812, + "timestamp": "2025-09-04 03:50:15.787665", + "step": 588, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:50:15.894691", + "step": 588, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.061664290726184845, + "timestamp": "2025-09-04 03:50:15.916999", + "step": 589, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:50:16.000197", + "step": 589, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08124200999736786, + "timestamp": "2025-09-04 03:50:16.015088", + "step": 590, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:50:16.098629", + "step": 590, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012426053173840046, + "timestamp": "2025-09-04 03:50:16.113698", + "step": 591, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:50:16.213709", + "step": 591, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.11045590788125992, + "timestamp": "2025-09-04 03:50:16.233343", + "step": 592, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:50:16.333263", + "step": 592, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03249969705939293, + "timestamp": "2025-09-04 03:50:16.354179", + "step": 593, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:50:16.464080", + "step": 593, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023777302354574203, + "timestamp": "2025-09-04 03:50:16.484351", + "step": 594, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:50:16.593894", + "step": 594, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017426855862140656, + "timestamp": "2025-09-04 03:50:16.614358", + "step": 595, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:50:16.715910", + "step": 595, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.032828718423843384, + "timestamp": "2025-09-04 03:50:16.735355", + "step": 596, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:50:16.826539", + "step": 596, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08800698816776276, + "timestamp": "2025-09-04 03:50:16.845406", + "step": 597, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:50:16.939524", + "step": 597, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05674955993890762, + "timestamp": "2025-09-04 03:50:16.956622", + "step": 598, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:50:17.057539", + "step": 598, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02611662819981575, + "timestamp": "2025-09-04 03:50:17.076255", + "step": 599, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:50:17.154424", + "step": 599, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.037351664155721664, + "timestamp": "2025-09-04 03:50:17.169108", + "step": 600, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:50:25.545854", + "step": 600, + "epoch": 1 + }, + { + "type": "pplx", + "content": 324.7015294662687, + "timestamp": "2025-09-04 03:50:25.548258", + "step": 600, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 600", + "timestamp": "2025-09-04 03:50:25.891967", + "step": 600, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 03:50:25.963785", + "step": 600, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05272102355957031, + "timestamp": "2025-09-04 03:50:25.978431", + "step": 601, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:50:26.080090", + "step": 601, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013583468273282051, + "timestamp": "2025-09-04 03:50:26.099071", + "step": 602, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:50:26.197206", + "step": 602, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021021874621510506, + "timestamp": "2025-09-04 03:50:26.215799", + "step": 603, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:50:26.310990", + "step": 603, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012546015903353691, + "timestamp": "2025-09-04 03:50:26.329269", + "step": 604, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:50:26.429291", + "step": 604, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04589983820915222, + "timestamp": "2025-09-04 03:50:26.450420", + "step": 605, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:50:26.545770", + "step": 605, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04309983178973198, + "timestamp": "2025-09-04 03:50:26.563268", + "step": 606, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:50:26.648264", + "step": 606, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06627572327852249, + "timestamp": "2025-09-04 03:50:26.663786", + "step": 607, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:50:26.758202", + "step": 607, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02102210372686386, + "timestamp": "2025-09-04 03:50:26.776450", + "step": 608, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 800 + ], + "flops": 16000097197184.0 + }, + "timestamp": "2025-09-04 03:50:26.892497", + "step": 608, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04680265486240387, + "timestamp": "2025-09-04 03:50:26.916116", + "step": 609, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:50:27.019321", + "step": 609, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05080176889896393, + "timestamp": "2025-09-04 03:50:27.038659", + "step": 610, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:50:27.138655", + "step": 610, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02795407734811306, + "timestamp": "2025-09-04 03:50:27.157004", + "step": 611, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:50:27.231965", + "step": 611, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025690896436572075, + "timestamp": "2025-09-04 03:50:27.246524", + "step": 612, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:50:27.326510", + "step": 612, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011252271011471748, + "timestamp": "2025-09-04 03:50:27.343040", + "step": 613, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:50:27.426099", + "step": 613, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029573671519756317, + "timestamp": "2025-09-04 03:50:27.440961", + "step": 614, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:50:27.542987", + "step": 614, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03328055888414383, + "timestamp": "2025-09-04 03:50:27.562129", + "step": 615, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:50:27.656249", + "step": 615, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03198854997754097, + "timestamp": "2025-09-04 03:50:27.673816", + "step": 616, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:50:27.775050", + "step": 616, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05293620377779007, + "timestamp": "2025-09-04 03:50:27.795724", + "step": 617, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 03:50:27.918294", + "step": 617, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030497848987579346, + "timestamp": "2025-09-04 03:50:27.941330", + "step": 618, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:50:28.055770", + "step": 618, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.003063701558858156, + "timestamp": "2025-09-04 03:50:28.074792", + "step": 619, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 816 + ], + "flops": 16320099139776.0 + }, + "timestamp": "2025-09-04 03:50:28.196976", + "step": 619, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00672591058537364, + "timestamp": "2025-09-04 03:50:28.220908", + "step": 620, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:50:36.596005", + "step": 620, + "epoch": 1 + }, + { + "type": "pplx", + "content": 330.22229839916525, + "timestamp": "2025-09-04 03:50:36.598510", + "step": 620, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:50:36.680013", + "step": 620, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07592124491930008, + "timestamp": "2025-09-04 03:50:36.697115", + "step": 621, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:50:36.797854", + "step": 621, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03790687769651413, + "timestamp": "2025-09-04 03:50:36.816637", + "step": 622, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:50:36.894069", + "step": 622, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05864779278635979, + "timestamp": "2025-09-04 03:50:36.907933", + "step": 623, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:50:37.000229", + "step": 623, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04656538739800453, + "timestamp": "2025-09-04 03:50:37.017840", + "step": 624, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:50:37.106987", + "step": 624, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08541495352983475, + "timestamp": "2025-09-04 03:50:37.125351", + "step": 625, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:50:37.227295", + "step": 625, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019130367785692215, + "timestamp": "2025-09-04 03:50:37.245938", + "step": 626, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:50:37.339064", + "step": 626, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030174510553479195, + "timestamp": "2025-09-04 03:50:37.356225", + "step": 627, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:50:37.451088", + "step": 627, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012393724173307419, + "timestamp": "2025-09-04 03:50:37.469176", + "step": 628, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:50:37.557289", + "step": 628, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03318622708320618, + "timestamp": "2025-09-04 03:50:37.575472", + "step": 629, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:50:37.652296", + "step": 629, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.041939180344343185, + "timestamp": "2025-09-04 03:50:37.666329", + "step": 630, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:50:37.759762", + "step": 630, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01324822474271059, + "timestamp": "2025-09-04 03:50:37.777018", + "step": 631, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:50:37.855647", + "step": 631, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011515927501022816, + "timestamp": "2025-09-04 03:50:37.870228", + "step": 632, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:50:37.960942", + "step": 632, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03529660031199455, + "timestamp": "2025-09-04 03:50:37.979640", + "step": 633, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:50:38.081658", + "step": 633, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.038785651326179504, + "timestamp": "2025-09-04 03:50:38.100818", + "step": 634, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 944 + ], + "flops": 18880114680512.0 + }, + "timestamp": "2025-09-04 03:50:38.239722", + "step": 634, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014862718991935253, + "timestamp": "2025-09-04 03:50:38.265727", + "step": 635, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:50:38.366128", + "step": 635, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.054356202483177185, + "timestamp": "2025-09-04 03:50:38.385549", + "step": 636, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:50:38.485770", + "step": 636, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.12962757050991058, + "timestamp": "2025-09-04 03:50:38.506762", + "step": 637, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:50:38.617492", + "step": 637, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04526481032371521, + "timestamp": "2025-09-04 03:50:38.635828", + "step": 638, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:50:38.739668", + "step": 638, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022485429421067238, + "timestamp": "2025-09-04 03:50:38.758702", + "step": 639, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:50:38.861201", + "step": 639, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01773577369749546, + "timestamp": "2025-09-04 03:50:38.880872", + "step": 640, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:50:47.264720", + "step": 640, + "epoch": 1 + }, + { + "type": "pplx", + "content": 331.5543720069065, + "timestamp": "2025-09-04 03:50:47.267008", + "step": 640, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 640", + "timestamp": "2025-09-04 03:50:47.629883", + "step": 640, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 416 + ], + "flops": 8320050574976.0 + }, + "timestamp": "2025-09-04 03:50:47.702312", + "step": 640, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018003080040216446, + "timestamp": "2025-09-04 03:50:47.714561", + "step": 641, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:50:47.791539", + "step": 641, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01093088649213314, + "timestamp": "2025-09-04 03:50:47.805588", + "step": 642, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:50:47.905150", + "step": 642, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02146296203136444, + "timestamp": "2025-09-04 03:50:47.923461", + "step": 643, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:50:48.024088", + "step": 643, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04993215575814247, + "timestamp": "2025-09-04 03:50:48.043552", + "step": 644, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:50:48.143506", + "step": 644, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08081956207752228, + "timestamp": "2025-09-04 03:50:48.164354", + "step": 645, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:50:48.277592", + "step": 645, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018953487277030945, + "timestamp": "2025-09-04 03:50:48.297957", + "step": 646, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:50:48.396340", + "step": 646, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024002674967050552, + "timestamp": "2025-09-04 03:50:48.413527", + "step": 647, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:50:48.521196", + "step": 647, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.004336449783295393, + "timestamp": "2025-09-04 03:50:48.542030", + "step": 648, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:50:48.633156", + "step": 648, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0077681634575128555, + "timestamp": "2025-09-04 03:50:48.652155", + "step": 649, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:50:48.745478", + "step": 649, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03083970956504345, + "timestamp": "2025-09-04 03:50:48.762789", + "step": 650, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:50:48.872150", + "step": 650, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012532150372862816, + "timestamp": "2025-09-04 03:50:48.892489", + "step": 651, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:50:48.993220", + "step": 651, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.060628145933151245, + "timestamp": "2025-09-04 03:50:49.012649", + "step": 652, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:50:49.103744", + "step": 652, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008690881542861462, + "timestamp": "2025-09-04 03:50:49.122962", + "step": 653, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:50:49.223690", + "step": 653, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014455270953476429, + "timestamp": "2025-09-04 03:50:49.242713", + "step": 654, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:50:49.336934", + "step": 654, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013105835765600204, + "timestamp": "2025-09-04 03:50:49.354425", + "step": 655, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:50:49.439534", + "step": 655, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07032399624586105, + "timestamp": "2025-09-04 03:50:49.455865", + "step": 656, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:50:49.528709", + "step": 656, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07407846301794052, + "timestamp": "2025-09-04 03:50:49.543655", + "step": 657, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:50:49.651803", + "step": 657, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009337909519672394, + "timestamp": "2025-09-04 03:50:49.672288", + "step": 658, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:50:49.766720", + "step": 658, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017982447519898415, + "timestamp": "2025-09-04 03:50:49.784007", + "step": 659, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:50:49.889673", + "step": 659, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009215055964887142, + "timestamp": "2025-09-04 03:50:49.910355", + "step": 660, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:50:58.286370", + "step": 660, + "epoch": 1 + }, + { + "type": "pplx", + "content": 330.0641296778058, + "timestamp": "2025-09-04 03:50:58.288230", + "step": 660, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:50:58.386848", + "step": 660, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02920273132622242, + "timestamp": "2025-09-04 03:50:58.408025", + "step": 661, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:50:58.498758", + "step": 661, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007981177419424057, + "timestamp": "2025-09-04 03:50:58.515280", + "step": 662, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:50:58.615295", + "step": 662, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017763612791895866, + "timestamp": "2025-09-04 03:50:58.633595", + "step": 663, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:50:58.734437", + "step": 663, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013931555673480034, + "timestamp": "2025-09-04 03:50:58.753624", + "step": 664, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:50:58.858226", + "step": 664, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013805502094328403, + "timestamp": "2025-09-04 03:50:58.880186", + "step": 665, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:50:58.975496", + "step": 665, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.040271319448947906, + "timestamp": "2025-09-04 03:50:58.992762", + "step": 666, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:50:59.088259", + "step": 666, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008608100935816765, + "timestamp": "2025-09-04 03:50:59.105528", + "step": 667, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:50:59.215099", + "step": 667, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018997695297002792, + "timestamp": "2025-09-04 03:50:59.235830", + "step": 668, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:50:59.340368", + "step": 668, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04310667887330055, + "timestamp": "2025-09-04 03:50:59.362359", + "step": 669, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 03:50:59.436010", + "step": 669, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01862409897148609, + "timestamp": "2025-09-04 03:50:59.448669", + "step": 670, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:50:59.533601", + "step": 670, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.032819997519254684, + "timestamp": "2025-09-04 03:50:59.548822", + "step": 671, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:50:59.655577", + "step": 671, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024017833173274994, + "timestamp": "2025-09-04 03:50:59.676063", + "step": 672, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:50:59.767083", + "step": 672, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026013823226094246, + "timestamp": "2025-09-04 03:50:59.785657", + "step": 673, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:50:59.879185", + "step": 673, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0160057432949543, + "timestamp": "2025-09-04 03:50:59.896099", + "step": 674, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:51:00.000068", + "step": 674, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016003988683223724, + "timestamp": "2025-09-04 03:51:00.019146", + "step": 675, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:51:00.115125", + "step": 675, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021060237661004066, + "timestamp": "2025-09-04 03:51:00.133211", + "step": 676, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:51:00.236367", + "step": 676, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05808281898498535, + "timestamp": "2025-09-04 03:51:00.258278", + "step": 677, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1120 + ], + "flops": 22400136049024.0 + }, + "timestamp": "2025-09-04 03:51:00.421947", + "step": 677, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013201756402850151, + "timestamp": "2025-09-04 03:51:00.453778", + "step": 678, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:51:00.538723", + "step": 678, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02031007781624794, + "timestamp": "2025-09-04 03:51:00.553892", + "step": 679, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:51:00.657200", + "step": 679, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.034204911440610886, + "timestamp": "2025-09-04 03:51:00.676840", + "step": 680, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:51:09.031779", + "step": 680, + "epoch": 1 + }, + { + "type": "pplx", + "content": 330.2300531347332, + "timestamp": "2025-09-04 03:51:09.033666", + "step": 680, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 680", + "timestamp": "2025-09-04 03:51:09.446722", + "step": 680, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:51:09.542085", + "step": 680, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06078000366687775, + "timestamp": "2025-09-04 03:51:09.562352", + "step": 681, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:51:09.664340", + "step": 681, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017026284709572792, + "timestamp": "2025-09-04 03:51:09.683255", + "step": 682, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:51:09.786570", + "step": 682, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03394745662808418, + "timestamp": "2025-09-04 03:51:09.805775", + "step": 683, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:51:09.908393", + "step": 683, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03497334569692612, + "timestamp": "2025-09-04 03:51:09.928050", + "step": 684, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:51:10.017677", + "step": 684, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025791537016630173, + "timestamp": "2025-09-04 03:51:10.036239", + "step": 685, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 03:51:10.152751", + "step": 685, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.049982357770204544, + "timestamp": "2025-09-04 03:51:10.174866", + "step": 686, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:51:10.265581", + "step": 686, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023276111111044884, + "timestamp": "2025-09-04 03:51:10.282308", + "step": 687, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:51:10.378533", + "step": 687, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016552327200770378, + "timestamp": "2025-09-04 03:51:10.396569", + "step": 688, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 800 + ], + "flops": 16000097197184.0 + }, + "timestamp": "2025-09-04 03:51:10.513537", + "step": 688, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014692885801196098, + "timestamp": "2025-09-04 03:51:10.537386", + "step": 689, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:51:10.643966", + "step": 689, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08659341931343079, + "timestamp": "2025-09-04 03:51:10.663724", + "step": 690, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:51:10.741883", + "step": 690, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030853156000375748, + "timestamp": "2025-09-04 03:51:10.755931", + "step": 691, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:51:10.839386", + "step": 691, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030888579785823822, + "timestamp": "2025-09-04 03:51:10.854966", + "step": 692, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:51:10.936300", + "step": 692, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03811494633555412, + "timestamp": "2025-09-04 03:51:10.952811", + "step": 693, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:51:11.058803", + "step": 693, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013888245448470116, + "timestamp": "2025-09-04 03:51:11.078585", + "step": 694, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 03:51:11.214227", + "step": 694, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03252805024385452, + "timestamp": "2025-09-04 03:51:11.239966", + "step": 695, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:51:11.325914", + "step": 695, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.057707663625478745, + "timestamp": "2025-09-04 03:51:11.342329", + "step": 696, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 03:51:11.414814", + "step": 696, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013484358787536621, + "timestamp": "2025-09-04 03:51:11.429372", + "step": 697, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 03:51:11.501648", + "step": 697, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05915412679314613, + "timestamp": "2025-09-04 03:51:11.514316", + "step": 698, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:51:11.610602", + "step": 698, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024636950343847275, + "timestamp": "2025-09-04 03:51:11.627855", + "step": 699, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:51:11.721668", + "step": 699, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017293041571974754, + "timestamp": "2025-09-04 03:51:11.739406", + "step": 700, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:51:20.115650", + "step": 700, + "epoch": 1 + }, + { + "type": "pplx", + "content": 334.05662580498205, + "timestamp": "2025-09-04 03:51:20.117602", + "step": 700, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:51:20.216569", + "step": 700, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020366515964269638, + "timestamp": "2025-09-04 03:51:20.237665", + "step": 701, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:51:20.345409", + "step": 701, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03450450301170349, + "timestamp": "2025-09-04 03:51:20.365186", + "step": 702, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:51:20.442718", + "step": 702, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.031207023188471794, + "timestamp": "2025-09-04 03:51:20.456727", + "step": 703, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:51:20.548080", + "step": 703, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.09872627258300781, + "timestamp": "2025-09-04 03:51:20.565328", + "step": 704, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:51:20.663398", + "step": 704, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03153887018561363, + "timestamp": "2025-09-04 03:51:20.683915", + "step": 705, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:51:20.786006", + "step": 705, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025656569749116898, + "timestamp": "2025-09-04 03:51:20.805069", + "step": 706, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:51:20.905492", + "step": 706, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007625031750649214, + "timestamp": "2025-09-04 03:51:20.924018", + "step": 707, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:51:21.001433", + "step": 707, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013403641991317272, + "timestamp": "2025-09-04 03:51:21.016042", + "step": 708, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:51:21.089639", + "step": 708, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04379529878497124, + "timestamp": "2025-09-04 03:51:21.104530", + "step": 709, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:51:21.210734", + "step": 709, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03386307880282402, + "timestamp": "2025-09-04 03:51:21.230561", + "step": 710, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:51:21.339667", + "step": 710, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023136505857110023, + "timestamp": "2025-09-04 03:51:21.360277", + "step": 711, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:51:21.470082", + "step": 711, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01604538969695568, + "timestamp": "2025-09-04 03:51:21.491196", + "step": 712, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:51:21.574067", + "step": 712, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0386708602309227, + "timestamp": "2025-09-04 03:51:21.590838", + "step": 713, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:51:21.701235", + "step": 713, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00971259456127882, + "timestamp": "2025-09-04 03:51:21.721504", + "step": 714, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:51:21.812160", + "step": 714, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06396154314279556, + "timestamp": "2025-09-04 03:51:21.829002", + "step": 715, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:51:21.928434", + "step": 715, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.044354457408189774, + "timestamp": "2025-09-04 03:51:21.947527", + "step": 716, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:51:22.037338", + "step": 716, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02656322531402111, + "timestamp": "2025-09-04 03:51:22.055648", + "step": 717, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:51:22.134993", + "step": 717, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03358420729637146, + "timestamp": "2025-09-04 03:51:22.148882", + "step": 718, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:51:22.250678", + "step": 718, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008803064003586769, + "timestamp": "2025-09-04 03:51:22.269676", + "step": 719, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:51:22.360223", + "step": 719, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.1287224441766739, + "timestamp": "2025-09-04 03:51:22.377492", + "step": 720, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:51:30.762033", + "step": 720, + "epoch": 1 + }, + { + "type": "pplx", + "content": 336.26355182577953, + "timestamp": "2025-09-04 03:51:30.763799", + "step": 720, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 720", + "timestamp": "2025-09-04 03:51:31.234372", + "step": 720, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 03:51:31.352125", + "step": 720, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00398655841127038, + "timestamp": "2025-09-04 03:51:31.377462", + "step": 721, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:51:31.461725", + "step": 721, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007859241217374802, + "timestamp": "2025-09-04 03:51:31.477012", + "step": 722, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:51:31.571329", + "step": 722, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027930831536650658, + "timestamp": "2025-09-04 03:51:31.588467", + "step": 723, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:51:31.687794", + "step": 723, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008873346261680126, + "timestamp": "2025-09-04 03:51:31.706953", + "step": 724, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:51:31.783615", + "step": 724, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06903643906116486, + "timestamp": "2025-09-04 03:51:31.798877", + "step": 725, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:51:31.901354", + "step": 725, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05149473994970322, + "timestamp": "2025-09-04 03:51:31.920403", + "step": 726, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:51:31.996940", + "step": 726, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0763927474617958, + "timestamp": "2025-09-04 03:51:32.010521", + "step": 727, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:51:32.113768", + "step": 727, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0026997928507626057, + "timestamp": "2025-09-04 03:51:32.133545", + "step": 728, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:51:32.210082", + "step": 728, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0394502729177475, + "timestamp": "2025-09-04 03:51:32.225171", + "step": 729, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:51:32.334298", + "step": 729, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009021886624395847, + "timestamp": "2025-09-04 03:51:32.354559", + "step": 730, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:51:32.442076", + "step": 730, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00816608127206564, + "timestamp": "2025-09-04 03:51:32.457490", + "step": 731, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 848 + ], + "flops": 16960103024960.0 + }, + "timestamp": "2025-09-04 03:51:32.583086", + "step": 731, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03670935705304146, + "timestamp": "2025-09-04 03:51:32.607621", + "step": 732, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:51:32.699631", + "step": 732, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.056674499064683914, + "timestamp": "2025-09-04 03:51:32.718181", + "step": 733, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:51:32.803873", + "step": 733, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.14876963198184967, + "timestamp": "2025-09-04 03:51:32.818788", + "step": 734, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:51:32.912540", + "step": 734, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012708455324172974, + "timestamp": "2025-09-04 03:51:32.929701", + "step": 735, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:51:33.039167", + "step": 735, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.09964840114116669, + "timestamp": "2025-09-04 03:51:33.060181", + "step": 736, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:51:33.161479", + "step": 736, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03136486932635307, + "timestamp": "2025-09-04 03:51:33.181948", + "step": 737, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:51:33.285762", + "step": 737, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04356590285897255, + "timestamp": "2025-09-04 03:51:33.304795", + "step": 738, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:51:33.405495", + "step": 738, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01183301117271185, + "timestamp": "2025-09-04 03:51:33.424143", + "step": 739, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:51:33.503460", + "step": 739, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02132677659392357, + "timestamp": "2025-09-04 03:51:33.518178", + "step": 740, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:51:41.890219", + "step": 740, + "epoch": 1 + }, + { + "type": "pplx", + "content": 330.76754669519477, + "timestamp": "2025-09-04 03:51:41.892697", + "step": 740, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:51:41.994167", + "step": 740, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.044686876237392426, + "timestamp": "2025-09-04 03:51:42.015959", + "step": 741, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:51:42.126745", + "step": 741, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04039061442017555, + "timestamp": "2025-09-04 03:51:42.147095", + "step": 742, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:51:42.249214", + "step": 742, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022664330899715424, + "timestamp": "2025-09-04 03:51:42.267518", + "step": 743, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:51:42.364434", + "step": 743, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02850925363600254, + "timestamp": "2025-09-04 03:51:42.382084", + "step": 744, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:51:42.472301", + "step": 744, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03210241347551346, + "timestamp": "2025-09-04 03:51:42.490534", + "step": 745, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:51:42.570864", + "step": 745, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0489024817943573, + "timestamp": "2025-09-04 03:51:42.584554", + "step": 746, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:51:42.693167", + "step": 746, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.034052006900310516, + "timestamp": "2025-09-04 03:51:42.713106", + "step": 747, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 03:51:42.830107", + "step": 747, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012416801415383816, + "timestamp": "2025-09-04 03:51:42.852718", + "step": 748, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:51:42.960820", + "step": 748, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05252353101968765, + "timestamp": "2025-09-04 03:51:42.983407", + "step": 749, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:51:43.060970", + "step": 749, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021606309339404106, + "timestamp": "2025-09-04 03:51:43.074914", + "step": 750, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:51:43.165463", + "step": 750, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009972968138754368, + "timestamp": "2025-09-04 03:51:43.182006", + "step": 751, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:51:43.292002", + "step": 751, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017765356227755547, + "timestamp": "2025-09-04 03:51:43.313301", + "step": 752, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:51:43.411771", + "step": 752, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04759746789932251, + "timestamp": "2025-09-04 03:51:43.432225", + "step": 753, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:51:43.540540", + "step": 753, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023347793146967888, + "timestamp": "2025-09-04 03:51:43.560567", + "step": 754, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:51:43.644484", + "step": 754, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04734707623720169, + "timestamp": "2025-09-04 03:51:43.659595", + "step": 755, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:51:43.768987", + "step": 755, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026827994734048843, + "timestamp": "2025-09-04 03:51:43.790268", + "step": 756, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:51:43.866644", + "step": 756, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.055104807019233704, + "timestamp": "2025-09-04 03:51:43.881921", + "step": 757, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:51:43.984924", + "step": 757, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03936297073960304, + "timestamp": "2025-09-04 03:51:44.004115", + "step": 758, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:51:44.090732", + "step": 758, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07558504492044449, + "timestamp": "2025-09-04 03:51:44.106115", + "step": 759, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:51:44.208853", + "step": 759, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011148090474307537, + "timestamp": "2025-09-04 03:51:44.228842", + "step": 760, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:51:52.633240", + "step": 760, + "epoch": 1 + }, + { + "type": "pplx", + "content": 325.2410786741373, + "timestamp": "2025-09-04 03:51:52.635164", + "step": 760, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 760", + "timestamp": "2025-09-04 03:51:53.016229", + "step": 760, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:51:53.098194", + "step": 760, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.038804348558187485, + "timestamp": "2025-09-04 03:51:53.114486", + "step": 761, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:51:53.225215", + "step": 761, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.09771209210157394, + "timestamp": "2025-09-04 03:51:53.245500", + "step": 762, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:51:53.339628", + "step": 762, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015294702723622322, + "timestamp": "2025-09-04 03:51:53.356673", + "step": 763, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:51:53.457292", + "step": 763, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06018395349383354, + "timestamp": "2025-09-04 03:51:53.476673", + "step": 764, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:51:53.573539", + "step": 764, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015591312199831009, + "timestamp": "2025-09-04 03:51:53.593665", + "step": 765, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:51:53.686695", + "step": 765, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04299828037619591, + "timestamp": "2025-09-04 03:51:53.703554", + "step": 766, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:51:53.806707", + "step": 766, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06639469414949417, + "timestamp": "2025-09-04 03:51:53.825270", + "step": 767, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:51:53.927352", + "step": 767, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06221333146095276, + "timestamp": "2025-09-04 03:51:53.947041", + "step": 768, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:51:54.053386", + "step": 768, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014062750153243542, + "timestamp": "2025-09-04 03:51:54.075653", + "step": 769, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:51:54.184801", + "step": 769, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05087198689579964, + "timestamp": "2025-09-04 03:51:54.204773", + "step": 770, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:51:54.305488", + "step": 770, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021481147035956383, + "timestamp": "2025-09-04 03:51:54.324163", + "step": 771, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 432 + ], + "flops": 8640052517568.0 + }, + "timestamp": "2025-09-04 03:51:54.394241", + "step": 771, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.031000670045614243, + "timestamp": "2025-09-04 03:51:54.407850", + "step": 772, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:51:54.499948", + "step": 772, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0011418497888371348, + "timestamp": "2025-09-04 03:51:54.518917", + "step": 773, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:51:54.625581", + "step": 773, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02311205305159092, + "timestamp": "2025-09-04 03:51:54.645492", + "step": 774, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 944 + ], + "flops": 18880114680512.0 + }, + "timestamp": "2025-09-04 03:51:54.780476", + "step": 774, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02674124389886856, + "timestamp": "2025-09-04 03:51:54.806404", + "step": 775, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:51:54.915425", + "step": 775, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04661679267883301, + "timestamp": "2025-09-04 03:51:54.936816", + "step": 776, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:51:55.019120", + "step": 776, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04548424482345581, + "timestamp": "2025-09-04 03:51:55.036198", + "step": 777, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:51:55.125560", + "step": 777, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007166591938585043, + "timestamp": "2025-09-04 03:51:55.142205", + "step": 778, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:51:55.244199", + "step": 778, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06729776412248611, + "timestamp": "2025-09-04 03:51:55.262780", + "step": 779, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:51:55.362266", + "step": 779, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007628207094967365, + "timestamp": "2025-09-04 03:51:55.381603", + "step": 780, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:52:03.772736", + "step": 780, + "epoch": 1 + }, + { + "type": "pplx", + "content": 323.20181798344566, + "timestamp": "2025-09-04 03:52:03.775082", + "step": 780, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:52:03.857745", + "step": 780, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.004363184329122305, + "timestamp": "2025-09-04 03:52:03.874922", + "step": 781, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:52:03.975884", + "step": 781, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01874414086341858, + "timestamp": "2025-09-04 03:52:03.994665", + "step": 782, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:52:04.082345", + "step": 782, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0951443463563919, + "timestamp": "2025-09-04 03:52:04.097756", + "step": 783, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:52:04.192694", + "step": 783, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.09938696026802063, + "timestamp": "2025-09-04 03:52:04.210641", + "step": 784, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 03:52:04.323748", + "step": 784, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010282545350492, + "timestamp": "2025-09-04 03:52:04.347897", + "step": 785, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:52:04.446431", + "step": 785, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.1636459231376648, + "timestamp": "2025-09-04 03:52:04.465085", + "step": 786, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:52:04.551108", + "step": 786, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022835776209831238, + "timestamp": "2025-09-04 03:52:04.566559", + "step": 787, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 816 + ], + "flops": 16320099139776.0 + }, + "timestamp": "2025-09-04 03:52:04.688871", + "step": 787, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.004113573580980301, + "timestamp": "2025-09-04 03:52:04.712558", + "step": 788, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:52:04.813658", + "step": 788, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006749707739800215, + "timestamp": "2025-09-04 03:52:04.834669", + "step": 789, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:52:04.935197", + "step": 789, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04576265811920166, + "timestamp": "2025-09-04 03:52:04.953879", + "step": 790, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:52:05.047740", + "step": 790, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024862490594387054, + "timestamp": "2025-09-04 03:52:05.065114", + "step": 791, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:52:05.165236", + "step": 791, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006702667102217674, + "timestamp": "2025-09-04 03:52:05.184551", + "step": 792, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:52:05.276679", + "step": 792, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017640264704823494, + "timestamp": "2025-09-04 03:52:05.295657", + "step": 793, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:52:05.390345", + "step": 793, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03208797425031662, + "timestamp": "2025-09-04 03:52:05.407251", + "step": 794, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:52:05.508560", + "step": 794, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03511238843202591, + "timestamp": "2025-09-04 03:52:05.527172", + "step": 795, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:52:05.620511", + "step": 795, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08914028853178024, + "timestamp": "2025-09-04 03:52:05.638141", + "step": 796, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:52:05.741354", + "step": 796, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08203835040330887, + "timestamp": "2025-09-04 03:52:05.763282", + "step": 797, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:52:05.867104", + "step": 797, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04981414973735809, + "timestamp": "2025-09-04 03:52:05.886381", + "step": 798, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:52:05.985482", + "step": 798, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007873651571571827, + "timestamp": "2025-09-04 03:52:06.004060", + "step": 799, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:52:06.106641", + "step": 799, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008369551040232182, + "timestamp": "2025-09-04 03:52:06.125955", + "step": 800, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:52:14.582165", + "step": 800, + "epoch": 1 + }, + { + "type": "pplx", + "content": 322.3950710812259, + "timestamp": "2025-09-04 03:52:14.584469", + "step": 800, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 800", + "timestamp": "2025-09-04 03:52:14.943950", + "step": 800, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:52:15.032165", + "step": 800, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013865088112652302, + "timestamp": "2025-09-04 03:52:15.050000", + "step": 801, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:52:15.145702", + "step": 801, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011338443495333195, + "timestamp": "2025-09-04 03:52:15.162426", + "step": 802, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:52:15.266835", + "step": 802, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01849699579179287, + "timestamp": "2025-09-04 03:52:15.285675", + "step": 803, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 944 + ], + "flops": 18880114680512.0 + }, + "timestamp": "2025-09-04 03:52:15.424546", + "step": 803, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.046258725225925446, + "timestamp": "2025-09-04 03:52:15.450883", + "step": 804, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:52:15.541096", + "step": 804, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06461029499769211, + "timestamp": "2025-09-04 03:52:15.558768", + "step": 805, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:52:15.650764", + "step": 805, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010986247099936008, + "timestamp": "2025-09-04 03:52:15.667207", + "step": 806, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:52:15.771785", + "step": 806, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.002984520047903061, + "timestamp": "2025-09-04 03:52:15.790613", + "step": 807, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:52:15.883372", + "step": 807, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014198306016623974, + "timestamp": "2025-09-04 03:52:15.900243", + "step": 808, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:52:16.002786", + "step": 808, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03292492404580116, + "timestamp": "2025-09-04 03:52:16.023360", + "step": 809, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:52:16.128536", + "step": 809, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.031847354024648666, + "timestamp": "2025-09-04 03:52:16.147172", + "step": 810, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:52:16.225296", + "step": 810, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0046399482525885105, + "timestamp": "2025-09-04 03:52:16.238472", + "step": 811, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:52:16.350870", + "step": 811, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.028976459056138992, + "timestamp": "2025-09-04 03:52:16.371461", + "step": 812, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:52:16.471884", + "step": 812, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06571343541145325, + "timestamp": "2025-09-04 03:52:16.491658", + "step": 813, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:52:16.597372", + "step": 813, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.004271671175956726, + "timestamp": "2025-09-04 03:52:16.615972", + "step": 814, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:52:16.721553", + "step": 814, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04265111684799194, + "timestamp": "2025-09-04 03:52:16.740168", + "step": 815, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:52:16.843856", + "step": 815, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0795649066567421, + "timestamp": "2025-09-04 03:52:16.862808", + "step": 816, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:52:16.965373", + "step": 816, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007744000293314457, + "timestamp": "2025-09-04 03:52:16.985818", + "step": 817, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:52:17.097956", + "step": 817, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014986629597842693, + "timestamp": "2025-09-04 03:52:17.118420", + "step": 818, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 03:52:17.240914", + "step": 818, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019025737419724464, + "timestamp": "2025-09-04 03:52:17.262418", + "step": 819, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:52:17.369933", + "step": 819, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05677224323153496, + "timestamp": "2025-09-04 03:52:17.390488", + "step": 820, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:52:25.880722", + "step": 820, + "epoch": 1 + }, + { + "type": "pplx", + "content": 324.48675427637124, + "timestamp": "2025-09-04 03:52:25.882971", + "step": 820, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:52:25.983499", + "step": 820, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029658645391464233, + "timestamp": "2025-09-04 03:52:26.004547", + "step": 821, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:52:26.093326", + "step": 821, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01038702204823494, + "timestamp": "2025-09-04 03:52:26.108769", + "step": 822, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:52:26.214631", + "step": 822, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.001975145423784852, + "timestamp": "2025-09-04 03:52:26.231760", + "step": 823, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1248 + ], + "flops": 24960151589760.0 + }, + "timestamp": "2025-09-04 03:52:26.416298", + "step": 823, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0497569777071476, + "timestamp": "2025-09-04 03:52:26.451588", + "step": 824, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:52:26.569750", + "step": 824, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02662699669599533, + "timestamp": "2025-09-04 03:52:26.588463", + "step": 825, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:52:26.697814", + "step": 825, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0038292373064905405, + "timestamp": "2025-09-04 03:52:26.718387", + "step": 826, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:52:26.821113", + "step": 826, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07293792068958282, + "timestamp": "2025-09-04 03:52:26.840094", + "step": 827, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:52:26.923604", + "step": 827, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.13208356499671936, + "timestamp": "2025-09-04 03:52:26.939479", + "step": 828, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:52:27.024504", + "step": 828, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03219275176525116, + "timestamp": "2025-09-04 03:52:27.041444", + "step": 829, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:52:27.131547", + "step": 829, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00648118881508708, + "timestamp": "2025-09-04 03:52:27.148096", + "step": 830, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:52:27.248076", + "step": 830, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04718891531229019, + "timestamp": "2025-09-04 03:52:27.266378", + "step": 831, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:52:27.365554", + "step": 831, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.004248224198818207, + "timestamp": "2025-09-04 03:52:27.384715", + "step": 832, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1248 + ], + "flops": 24960151589760.0 + }, + "timestamp": "2025-09-04 03:52:27.565166", + "step": 832, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010911565274000168, + "timestamp": "2025-09-04 03:52:27.603065", + "step": 833, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:52:27.702939", + "step": 833, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.036790501326322556, + "timestamp": "2025-09-04 03:52:27.721421", + "step": 834, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:52:27.798834", + "step": 834, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08580999821424484, + "timestamp": "2025-09-04 03:52:27.812651", + "step": 835, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:52:27.912675", + "step": 835, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00923218298703432, + "timestamp": "2025-09-04 03:52:27.932194", + "step": 836, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:52:28.031507", + "step": 836, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007355087902396917, + "timestamp": "2025-09-04 03:52:28.052146", + "step": 837, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:52:28.148182", + "step": 837, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017146985977888107, + "timestamp": "2025-09-04 03:52:28.165495", + "step": 838, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:52:28.267178", + "step": 838, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021363992244005203, + "timestamp": "2025-09-04 03:52:28.286139", + "step": 839, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:52:28.395064", + "step": 839, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07390808314085007, + "timestamp": "2025-09-04 03:52:28.416262", + "step": 840, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:52:36.789891", + "step": 840, + "epoch": 1 + }, + { + "type": "pplx", + "content": 324.41111865398125, + "timestamp": "2025-09-04 03:52:36.791946", + "step": 840, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 840", + "timestamp": "2025-09-04 03:52:37.138794", + "step": 840, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:52:37.228808", + "step": 840, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008902303874492645, + "timestamp": "2025-09-04 03:52:37.247382", + "step": 841, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 03:52:37.322184", + "step": 841, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04457436874508858, + "timestamp": "2025-09-04 03:52:37.335432", + "step": 842, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:52:37.443107", + "step": 842, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011131439357995987, + "timestamp": "2025-09-04 03:52:37.463335", + "step": 843, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:52:37.571948", + "step": 843, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03997879475355148, + "timestamp": "2025-09-04 03:52:37.593109", + "step": 844, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:52:37.681186", + "step": 844, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05826606974005699, + "timestamp": "2025-09-04 03:52:37.699242", + "step": 845, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:52:37.798938", + "step": 845, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013584684580564499, + "timestamp": "2025-09-04 03:52:37.817704", + "step": 846, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1408 + ], + "flops": 28160171015680.0 + }, + "timestamp": "2025-09-04 03:52:38.022798", + "step": 846, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00445243064314127, + "timestamp": "2025-09-04 03:52:38.061808", + "step": 847, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:52:38.163815", + "step": 847, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022547859698534012, + "timestamp": "2025-09-04 03:52:38.183460", + "step": 848, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:52:38.289760", + "step": 848, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02277173474431038, + "timestamp": "2025-09-04 03:52:38.312237", + "step": 849, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:52:38.396170", + "step": 849, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.055064212530851364, + "timestamp": "2025-09-04 03:52:38.411132", + "step": 850, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:52:38.494339", + "step": 850, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009578646160662174, + "timestamp": "2025-09-04 03:52:38.509205", + "step": 851, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:52:38.603857", + "step": 851, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006946980953216553, + "timestamp": "2025-09-04 03:52:38.621878", + "step": 852, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:52:38.723300", + "step": 852, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.10980971902608871, + "timestamp": "2025-09-04 03:52:38.744250", + "step": 853, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:52:38.836222", + "step": 853, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02813224121928215, + "timestamp": "2025-09-04 03:52:38.852773", + "step": 854, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:52:38.952308", + "step": 854, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04458193480968475, + "timestamp": "2025-09-04 03:52:38.970909", + "step": 855, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:52:39.048237", + "step": 855, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03821039944887161, + "timestamp": "2025-09-04 03:52:39.062827", + "step": 856, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:52:39.152919", + "step": 856, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.032031524926424026, + "timestamp": "2025-09-04 03:52:39.171567", + "step": 857, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:52:39.266660", + "step": 857, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026080451905727386, + "timestamp": "2025-09-04 03:52:39.283145", + "step": 858, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:52:39.393922", + "step": 858, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06203945353627205, + "timestamp": "2025-09-04 03:52:39.414479", + "step": 859, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:52:39.518098", + "step": 859, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0699310228228569, + "timestamp": "2025-09-04 03:52:39.537950", + "step": 860, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:52:47.899492", + "step": 860, + "epoch": 1 + }, + { + "type": "pplx", + "content": 322.49340806315365, + "timestamp": "2025-09-04 03:52:47.901562", + "step": 860, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:52:48.003512", + "step": 860, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03510546684265137, + "timestamp": "2025-09-04 03:52:48.025476", + "step": 861, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:52:48.129042", + "step": 861, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010071228258311749, + "timestamp": "2025-09-04 03:52:48.148376", + "step": 862, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:52:48.251091", + "step": 862, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03212030977010727, + "timestamp": "2025-09-04 03:52:48.270097", + "step": 863, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:52:48.375653", + "step": 863, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.09204772859811783, + "timestamp": "2025-09-04 03:52:48.396300", + "step": 864, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:52:48.486243", + "step": 864, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03038841113448143, + "timestamp": "2025-09-04 03:52:48.504904", + "step": 865, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:52:48.606251", + "step": 865, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.058843135833740234, + "timestamp": "2025-09-04 03:52:48.624971", + "step": 866, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 896 + ], + "flops": 17920108852736.0 + }, + "timestamp": "2025-09-04 03:52:48.753998", + "step": 866, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011757034808397293, + "timestamp": "2025-09-04 03:52:48.778443", + "step": 867, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:52:48.883752", + "step": 867, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.055646833032369614, + "timestamp": "2025-09-04 03:52:48.903198", + "step": 868, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:52:48.999421", + "step": 868, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007672054693102837, + "timestamp": "2025-09-04 03:52:49.019789", + "step": 869, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:52:49.126050", + "step": 869, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04655780270695686, + "timestamp": "2025-09-04 03:52:49.145853", + "step": 870, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:52:49.239252", + "step": 870, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009051861241459846, + "timestamp": "2025-09-04 03:52:49.256412", + "step": 871, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:52:49.340677", + "step": 871, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013100282289087772, + "timestamp": "2025-09-04 03:52:49.356697", + "step": 872, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:52:49.437905", + "step": 872, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017356760799884796, + "timestamp": "2025-09-04 03:52:49.454466", + "step": 873, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:52:49.557371", + "step": 873, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02461932599544525, + "timestamp": "2025-09-04 03:52:49.576385", + "step": 874, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:52:49.683118", + "step": 874, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.040192510932683945, + "timestamp": "2025-09-04 03:52:49.702845", + "step": 875, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:52:49.803670", + "step": 875, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04767023026943207, + "timestamp": "2025-09-04 03:52:49.823120", + "step": 876, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:52:49.930415", + "step": 876, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0067078592255711555, + "timestamp": "2025-09-04 03:52:49.952767", + "step": 877, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:52:50.051711", + "step": 877, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0437939316034317, + "timestamp": "2025-09-04 03:52:50.070336", + "step": 878, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:52:50.180351", + "step": 878, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00434476463124156, + "timestamp": "2025-09-04 03:52:50.200624", + "step": 879, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:52:50.293161", + "step": 879, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02230999432504177, + "timestamp": "2025-09-04 03:52:50.310845", + "step": 880, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:52:58.696210", + "step": 880, + "epoch": 1 + }, + { + "type": "pplx", + "content": 320.0679803850578, + "timestamp": "2025-09-04 03:52:58.697942", + "step": 880, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 880", + "timestamp": "2025-09-04 03:52:59.217621", + "step": 880, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:52:59.308241", + "step": 880, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021877720952033997, + "timestamp": "2025-09-04 03:52:59.326920", + "step": 881, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:52:59.423770", + "step": 881, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03120221011340618, + "timestamp": "2025-09-04 03:52:59.441082", + "step": 882, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:52:59.533374", + "step": 882, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017093293368816376, + "timestamp": "2025-09-04 03:52:59.550315", + "step": 883, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:52:59.654042", + "step": 883, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.045136742293834686, + "timestamp": "2025-09-04 03:52:59.674000", + "step": 884, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:52:59.771460", + "step": 884, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06527303159236908, + "timestamp": "2025-09-04 03:52:59.792160", + "step": 885, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:52:59.887195", + "step": 885, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01749110408127308, + "timestamp": "2025-09-04 03:52:59.904116", + "step": 886, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:53:00.010118", + "step": 886, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006269685458391905, + "timestamp": "2025-09-04 03:53:00.029842", + "step": 887, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:53:00.134196", + "step": 887, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012028466910123825, + "timestamp": "2025-09-04 03:53:00.154044", + "step": 888, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:53:00.265849", + "step": 888, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030681060627102852, + "timestamp": "2025-09-04 03:53:00.288336", + "step": 889, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:53:00.394509", + "step": 889, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014401630498468876, + "timestamp": "2025-09-04 03:53:00.413831", + "step": 890, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:53:00.497526", + "step": 890, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04923472926020622, + "timestamp": "2025-09-04 03:53:00.512537", + "step": 891, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:53:00.611406", + "step": 891, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.056294139474630356, + "timestamp": "2025-09-04 03:53:00.630919", + "step": 892, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:53:00.735420", + "step": 892, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05868148058652878, + "timestamp": "2025-09-04 03:53:00.755788", + "step": 893, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:53:00.857660", + "step": 893, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04890258610248566, + "timestamp": "2025-09-04 03:53:00.876619", + "step": 894, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:53:00.976525", + "step": 894, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024650558829307556, + "timestamp": "2025-09-04 03:53:00.995278", + "step": 895, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:53:01.094960", + "step": 895, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007203355897217989, + "timestamp": "2025-09-04 03:53:01.114386", + "step": 896, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:53:01.215264", + "step": 896, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04748008772730827, + "timestamp": "2025-09-04 03:53:01.236042", + "step": 897, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:53:01.342527", + "step": 897, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011301451362669468, + "timestamp": "2025-09-04 03:53:01.362187", + "step": 898, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:53:01.452563", + "step": 898, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03385727107524872, + "timestamp": "2025-09-04 03:53:01.469235", + "step": 899, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:53:01.575911", + "step": 899, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.037546951323747635, + "timestamp": "2025-09-04 03:53:01.595843", + "step": 900, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:53:09.952980", + "step": 900, + "epoch": 1 + }, + { + "type": "pplx", + "content": 319.89447585761627, + "timestamp": "2025-09-04 03:53:09.954474", + "step": 900, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:53:10.052039", + "step": 900, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01678859256207943, + "timestamp": "2025-09-04 03:53:10.073130", + "step": 901, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:53:10.166839", + "step": 901, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.038133881986141205, + "timestamp": "2025-09-04 03:53:10.183993", + "step": 902, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:53:10.274791", + "step": 902, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019613448530435562, + "timestamp": "2025-09-04 03:53:10.291441", + "step": 903, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:53:10.379595", + "step": 903, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020688189193606377, + "timestamp": "2025-09-04 03:53:10.395757", + "step": 904, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:53:10.487110", + "step": 904, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.005966340657323599, + "timestamp": "2025-09-04 03:53:10.505940", + "step": 905, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:53:10.607711", + "step": 905, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01767931506037712, + "timestamp": "2025-09-04 03:53:10.626611", + "step": 906, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:53:10.719273", + "step": 906, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02633294090628624, + "timestamp": "2025-09-04 03:53:10.736157", + "step": 907, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:53:10.839880", + "step": 907, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03189924731850624, + "timestamp": "2025-09-04 03:53:10.859731", + "step": 908, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:53:10.962770", + "step": 908, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.003187986556440592, + "timestamp": "2025-09-04 03:53:10.984587", + "step": 909, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:53:11.084774", + "step": 909, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07128451019525528, + "timestamp": "2025-09-04 03:53:11.101434", + "step": 910, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:53:11.213242", + "step": 910, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.048821836709976196, + "timestamp": "2025-09-04 03:53:11.232355", + "step": 911, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:53:11.347294", + "step": 911, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00408367533236742, + "timestamp": "2025-09-04 03:53:11.366967", + "step": 912, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:53:11.473704", + "step": 912, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04687316343188286, + "timestamp": "2025-09-04 03:53:11.492664", + "step": 913, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:53:11.586826", + "step": 913, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.035709548741579056, + "timestamp": "2025-09-04 03:53:11.603966", + "step": 914, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:53:11.707434", + "step": 914, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0572127141058445, + "timestamp": "2025-09-04 03:53:11.726695", + "step": 915, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:53:11.816754", + "step": 915, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.09570334106683731, + "timestamp": "2025-09-04 03:53:11.834156", + "step": 916, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:53:11.932053", + "step": 916, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05318867787718773, + "timestamp": "2025-09-04 03:53:11.952724", + "step": 917, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:53:12.054726", + "step": 917, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008588760159909725, + "timestamp": "2025-09-04 03:53:12.073695", + "step": 918, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 03:53:12.212450", + "step": 918, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007535313721746206, + "timestamp": "2025-09-04 03:53:12.238373", + "step": 919, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:53:12.332103", + "step": 919, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020334357395768166, + "timestamp": "2025-09-04 03:53:12.350325", + "step": 920, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:53:20.755706", + "step": 920, + "epoch": 1 + }, + { + "type": "pplx", + "content": 323.29331082900484, + "timestamp": "2025-09-04 03:53:20.757985", + "step": 920, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 920", + "timestamp": "2025-09-04 03:53:21.106061", + "step": 920, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:53:21.206709", + "step": 920, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.004637205507606268, + "timestamp": "2025-09-04 03:53:21.227753", + "step": 921, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 03:53:21.303208", + "step": 921, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02929893508553505, + "timestamp": "2025-09-04 03:53:21.316393", + "step": 922, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:53:21.409671", + "step": 922, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05244254320859909, + "timestamp": "2025-09-04 03:53:21.426608", + "step": 923, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 03:53:21.561973", + "step": 923, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014228380285203457, + "timestamp": "2025-09-04 03:53:21.588607", + "step": 924, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:53:21.689260", + "step": 924, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014327945187687874, + "timestamp": "2025-09-04 03:53:21.710185", + "step": 925, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:53:21.806681", + "step": 925, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04167867451906204, + "timestamp": "2025-09-04 03:53:21.823957", + "step": 926, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:53:21.914206", + "step": 926, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.004316235426813364, + "timestamp": "2025-09-04 03:53:21.930710", + "step": 927, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:53:22.006415", + "step": 927, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04848542809486389, + "timestamp": "2025-09-04 03:53:22.020751", + "step": 928, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:53:22.117538", + "step": 928, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008897616527974606, + "timestamp": "2025-09-04 03:53:22.137654", + "step": 929, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:53:22.246947", + "step": 929, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006201483774930239, + "timestamp": "2025-09-04 03:53:22.267302", + "step": 930, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:53:22.361441", + "step": 930, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06400009244680405, + "timestamp": "2025-09-04 03:53:22.378595", + "step": 931, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 03:53:22.502905", + "step": 931, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009115933440625668, + "timestamp": "2025-09-04 03:53:22.526648", + "step": 932, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:53:22.626849", + "step": 932, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01854240521788597, + "timestamp": "2025-09-04 03:53:22.647644", + "step": 933, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1168 + ], + "flops": 23360141876800.0 + }, + "timestamp": "2025-09-04 03:53:22.821929", + "step": 933, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015260078944265842, + "timestamp": "2025-09-04 03:53:22.854322", + "step": 934, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:53:22.961777", + "step": 934, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009783388115465641, + "timestamp": "2025-09-04 03:53:22.981491", + "step": 935, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:53:23.090151", + "step": 935, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01038886234164238, + "timestamp": "2025-09-04 03:53:23.110577", + "step": 936, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:53:23.199339", + "step": 936, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04066663980484009, + "timestamp": "2025-09-04 03:53:23.217678", + "step": 937, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:53:23.324197", + "step": 937, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015022533014416695, + "timestamp": "2025-09-04 03:53:23.343914", + "step": 938, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:53:23.429503", + "step": 938, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019334964454174042, + "timestamp": "2025-09-04 03:53:23.444805", + "step": 939, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:53:23.551488", + "step": 939, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027674207463860512, + "timestamp": "2025-09-04 03:53:23.572051", + "step": 940, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:53:31.943440", + "step": 940, + "epoch": 1 + }, + { + "type": "pplx", + "content": 324.7636312669318, + "timestamp": "2025-09-04 03:53:31.945581", + "step": 940, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:53:32.047643", + "step": 940, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.004062741529196501, + "timestamp": "2025-09-04 03:53:32.069548", + "step": 941, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1376 + ], + "flops": 27520167130496.0 + }, + "timestamp": "2025-09-04 03:53:32.273285", + "step": 941, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.11786609143018723, + "timestamp": "2025-09-04 03:53:32.312283", + "step": 942, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:53:32.390789", + "step": 942, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012708038091659546, + "timestamp": "2025-09-04 03:53:32.404914", + "step": 943, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 03:53:32.523185", + "step": 943, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05294226482510567, + "timestamp": "2025-09-04 03:53:32.546081", + "step": 944, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:53:32.652441", + "step": 944, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06667114049196243, + "timestamp": "2025-09-04 03:53:32.674610", + "step": 945, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:53:32.765158", + "step": 945, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017390718683600426, + "timestamp": "2025-09-04 03:53:32.781730", + "step": 946, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:53:32.866576", + "step": 946, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.003352563362568617, + "timestamp": "2025-09-04 03:53:32.881804", + "step": 947, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:53:32.984148", + "step": 947, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.059882860630750656, + "timestamp": "2025-09-04 03:53:33.003916", + "step": 948, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:53:33.096962", + "step": 948, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04028277471661568, + "timestamp": "2025-09-04 03:53:33.115970", + "step": 949, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1184 + ], + "flops": 23680143819392.0 + }, + "timestamp": "2025-09-04 03:53:33.290357", + "step": 949, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018983660265803337, + "timestamp": "2025-09-04 03:53:33.324730", + "step": 950, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:53:33.418146", + "step": 950, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01843661069869995, + "timestamp": "2025-09-04 03:53:33.435067", + "step": 951, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:53:33.518359", + "step": 951, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029021859169006348, + "timestamp": "2025-09-04 03:53:33.534111", + "step": 952, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:53:33.617540", + "step": 952, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01326957531273365, + "timestamp": "2025-09-04 03:53:33.634605", + "step": 953, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:53:33.713033", + "step": 953, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0053786844946444035, + "timestamp": "2025-09-04 03:53:33.726795", + "step": 954, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:53:33.829268", + "step": 954, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009815702214837074, + "timestamp": "2025-09-04 03:53:33.848427", + "step": 955, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:53:33.943180", + "step": 955, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012396165169775486, + "timestamp": "2025-09-04 03:53:33.961066", + "step": 956, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:53:34.067090", + "step": 956, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01327445451170206, + "timestamp": "2025-09-04 03:53:34.089193", + "step": 957, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1376 + ], + "flops": 27520167130496.0 + }, + "timestamp": "2025-09-04 03:53:34.292580", + "step": 957, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0052812471985816956, + "timestamp": "2025-09-04 03:53:34.331717", + "step": 958, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:53:34.436182", + "step": 958, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.037195418030023575, + "timestamp": "2025-09-04 03:53:34.455354", + "step": 959, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:53:34.554713", + "step": 959, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0028625449631363153, + "timestamp": "2025-09-04 03:53:34.574149", + "step": 960, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:53:42.954180", + "step": 960, + "epoch": 1 + }, + { + "type": "pplx", + "content": 327.4406282794405, + "timestamp": "2025-09-04 03:53:42.956295", + "step": 960, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 960", + "timestamp": "2025-09-04 03:53:43.310072", + "step": 960, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 800 + ], + "flops": 16000097197184.0 + }, + "timestamp": "2025-09-04 03:53:43.426090", + "step": 960, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.11174288392066956, + "timestamp": "2025-09-04 03:53:43.449893", + "step": 961, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:53:43.552254", + "step": 961, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06079157814383507, + "timestamp": "2025-09-04 03:53:43.571555", + "step": 962, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:53:43.674623", + "step": 962, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007762270979583263, + "timestamp": "2025-09-04 03:53:43.693819", + "step": 963, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:53:43.788814", + "step": 963, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.1044481098651886, + "timestamp": "2025-09-04 03:53:43.807251", + "step": 964, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:53:43.898657", + "step": 964, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04733144864439964, + "timestamp": "2025-09-04 03:53:43.917440", + "step": 965, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:53:44.018358", + "step": 965, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04005102813243866, + "timestamp": "2025-09-04 03:53:44.037363", + "step": 966, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:53:44.139306", + "step": 966, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015302048996090889, + "timestamp": "2025-09-04 03:53:44.158514", + "step": 967, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:53:44.262372", + "step": 967, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04137500002980232, + "timestamp": "2025-09-04 03:53:44.282149", + "step": 968, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 03:53:44.355720", + "step": 968, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012012657709419727, + "timestamp": "2025-09-04 03:53:44.370202", + "step": 969, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:53:44.473736", + "step": 969, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0035866987891495228, + "timestamp": "2025-09-04 03:53:44.492727", + "step": 970, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:53:44.593769", + "step": 970, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011977934278547764, + "timestamp": "2025-09-04 03:53:44.612318", + "step": 971, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:53:44.712118", + "step": 971, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010067245922982693, + "timestamp": "2025-09-04 03:53:44.731252", + "step": 972, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:53:44.819828", + "step": 972, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.025547686964273453, + "timestamp": "2025-09-04 03:53:44.837978", + "step": 973, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:53:44.915441", + "step": 973, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.037151534110307693, + "timestamp": "2025-09-04 03:53:44.929215", + "step": 974, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1472 + ], + "flops": 29440178786048.0 + }, + "timestamp": "2025-09-04 03:53:45.143074", + "step": 974, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027699746191501617, + "timestamp": "2025-09-04 03:53:45.184042", + "step": 975, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:53:45.279228", + "step": 975, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05770542845129967, + "timestamp": "2025-09-04 03:53:45.297565", + "step": 976, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:53:45.373416", + "step": 976, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010280147194862366, + "timestamp": "2025-09-04 03:53:45.388732", + "step": 977, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:53:45.491256", + "step": 977, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016326548531651497, + "timestamp": "2025-09-04 03:53:45.510479", + "step": 978, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 864 + ], + "flops": 17280104967552.0 + }, + "timestamp": "2025-09-04 03:53:45.637463", + "step": 978, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0350724533200264, + "timestamp": "2025-09-04 03:53:45.661885", + "step": 979, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:53:45.765390", + "step": 979, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012970902025699615, + "timestamp": "2025-09-04 03:53:45.785448", + "step": 980, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:53:54.156759", + "step": 980, + "epoch": 1 + }, + { + "type": "pplx", + "content": 326.69142554401174, + "timestamp": "2025-09-04 03:53:54.158794", + "step": 980, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:53:54.233062", + "step": 980, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008149470202624798, + "timestamp": "2025-09-04 03:53:54.248341", + "step": 981, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:53:54.351582", + "step": 981, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04477942734956741, + "timestamp": "2025-09-04 03:53:54.370852", + "step": 982, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:53:54.471887", + "step": 982, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04230527952313423, + "timestamp": "2025-09-04 03:53:54.490758", + "step": 983, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:53:54.569763", + "step": 983, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021094702184200287, + "timestamp": "2025-09-04 03:53:54.584677", + "step": 984, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:53:54.681680", + "step": 984, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.050900258123874664, + "timestamp": "2025-09-04 03:53:54.702364", + "step": 985, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:53:54.810576", + "step": 985, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014919820241630077, + "timestamp": "2025-09-04 03:53:54.830622", + "step": 986, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:53:54.921462", + "step": 986, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0671142116189003, + "timestamp": "2025-09-04 03:53:54.938186", + "step": 987, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:53:55.024616", + "step": 987, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04901759326457977, + "timestamp": "2025-09-04 03:53:55.040769", + "step": 988, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:53:55.129148", + "step": 988, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02041156031191349, + "timestamp": "2025-09-04 03:53:55.147314", + "step": 989, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:53:55.222313", + "step": 989, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03128139674663544, + "timestamp": "2025-09-04 03:53:55.236053", + "step": 990, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:53:55.328730", + "step": 990, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.1105596199631691, + "timestamp": "2025-09-04 03:53:55.345640", + "step": 991, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:53:55.446529", + "step": 991, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.2516331374645233, + "timestamp": "2025-09-04 03:53:55.465931", + "step": 992, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:53:55.565425", + "step": 992, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006421744357794523, + "timestamp": "2025-09-04 03:53:55.586207", + "step": 993, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:53:55.678970", + "step": 993, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.042763665318489075, + "timestamp": "2025-09-04 03:53:55.695888", + "step": 994, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:53:55.797442", + "step": 994, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.004555892664939165, + "timestamp": "2025-09-04 03:53:55.816067", + "step": 995, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:53:55.922577", + "step": 995, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04880642145872116, + "timestamp": "2025-09-04 03:53:55.943075", + "step": 996, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:53:56.040785", + "step": 996, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029496213421225548, + "timestamp": "2025-09-04 03:53:56.061263", + "step": 997, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:53:56.154620", + "step": 997, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01086695957928896, + "timestamp": "2025-09-04 03:53:56.171470", + "step": 998, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:53:56.272119", + "step": 998, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03509443625807762, + "timestamp": "2025-09-04 03:53:56.290774", + "step": 999, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:53:56.400883", + "step": 999, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05549605190753937, + "timestamp": "2025-09-04 03:53:56.422038", + "step": 1000, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:54:04.789966", + "step": 1000, + "epoch": 1 + }, + { + "type": "pplx", + "content": 323.93892790465287, + "timestamp": "2025-09-04 03:54:04.792236", + "step": 1000, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1000", + "timestamp": "2025-09-04 03:54:05.144251", + "step": 1000, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:54:05.217721", + "step": 1000, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03098498098552227, + "timestamp": "2025-09-04 03:54:05.232637", + "step": 1001, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:54:05.333112", + "step": 1001, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018459502607584, + "timestamp": "2025-09-04 03:54:05.352002", + "step": 1002, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:54:05.436559", + "step": 1002, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.045362938195466995, + "timestamp": "2025-09-04 03:54:05.451997", + "step": 1003, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:54:05.546333", + "step": 1003, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013697554357349873, + "timestamp": "2025-09-04 03:54:05.564241", + "step": 1004, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:54:05.656441", + "step": 1004, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014663312584161758, + "timestamp": "2025-09-04 03:54:05.675346", + "step": 1005, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:54:05.767072", + "step": 1005, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.033350620418787, + "timestamp": "2025-09-04 03:54:05.783605", + "step": 1006, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:54:05.887104", + "step": 1006, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0400872677564621, + "timestamp": "2025-09-04 03:54:05.906229", + "step": 1007, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:54:06.015408", + "step": 1007, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03101273812353611, + "timestamp": "2025-09-04 03:54:06.036432", + "step": 1008, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:54:06.129001", + "step": 1008, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01906219683587551, + "timestamp": "2025-09-04 03:54:06.147639", + "step": 1009, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:54:06.250583", + "step": 1009, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015632882714271545, + "timestamp": "2025-09-04 03:54:06.269673", + "step": 1010, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:54:06.373501", + "step": 1010, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015542350709438324, + "timestamp": "2025-09-04 03:54:06.392552", + "step": 1011, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:54:06.486372", + "step": 1011, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010193181224167347, + "timestamp": "2025-09-04 03:54:06.504449", + "step": 1012, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:54:06.604714", + "step": 1012, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009116173721849918, + "timestamp": "2025-09-04 03:54:06.625559", + "step": 1013, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:54:06.734943", + "step": 1013, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011737700551748276, + "timestamp": "2025-09-04 03:54:06.755347", + "step": 1014, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:54:06.858522", + "step": 1014, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05300810933113098, + "timestamp": "2025-09-04 03:54:06.877568", + "step": 1015, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:54:06.964396", + "step": 1015, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016496986150741577, + "timestamp": "2025-09-04 03:54:06.980756", + "step": 1016, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:54:07.063396", + "step": 1016, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06345777213573456, + "timestamp": "2025-09-04 03:54:07.080148", + "step": 1017, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:54:07.171364", + "step": 1017, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01955530233681202, + "timestamp": "2025-09-04 03:54:07.188018", + "step": 1018, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:54:07.288317", + "step": 1018, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009021622128784657, + "timestamp": "2025-09-04 03:54:07.306933", + "step": 1019, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:54:07.407378", + "step": 1019, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02454635314643383, + "timestamp": "2025-09-04 03:54:07.426759", + "step": 1020, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:54:15.807366", + "step": 1020, + "epoch": 1 + }, + { + "type": "pplx", + "content": 322.27552972591826, + "timestamp": "2025-09-04 03:54:15.809410", + "step": 1020, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:54:15.888783", + "step": 1020, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04511036351323128, + "timestamp": "2025-09-04 03:54:15.905147", + "step": 1021, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:54:16.013227", + "step": 1021, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.12731988728046417, + "timestamp": "2025-09-04 03:54:16.033415", + "step": 1022, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 944 + ], + "flops": 18880114680512.0 + }, + "timestamp": "2025-09-04 03:54:16.169915", + "step": 1022, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05131891369819641, + "timestamp": "2025-09-04 03:54:16.195930", + "step": 1023, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:54:16.297930", + "step": 1023, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020861327648162842, + "timestamp": "2025-09-04 03:54:16.317755", + "step": 1024, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:54:16.420449", + "step": 1024, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05731099098920822, + "timestamp": "2025-09-04 03:54:16.442256", + "step": 1025, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:54:16.544840", + "step": 1025, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04736688733100891, + "timestamp": "2025-09-04 03:54:16.563934", + "step": 1026, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:54:16.662732", + "step": 1026, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024726245552301407, + "timestamp": "2025-09-04 03:54:16.681418", + "step": 1027, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:54:16.764441", + "step": 1027, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07053575664758682, + "timestamp": "2025-09-04 03:54:16.780300", + "step": 1028, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:54:16.886324", + "step": 1028, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.003608345054090023, + "timestamp": "2025-09-04 03:54:16.908667", + "step": 1029, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:54:17.003170", + "step": 1029, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01059285830706358, + "timestamp": "2025-09-04 03:54:17.020000", + "step": 1030, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:54:17.120442", + "step": 1030, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026861051097512245, + "timestamp": "2025-09-04 03:54:17.139094", + "step": 1031, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1168 + ], + "flops": 23360141876800.0 + }, + "timestamp": "2025-09-04 03:54:17.313090", + "step": 1031, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.012563884258270264, + "timestamp": "2025-09-04 03:54:17.346335", + "step": 1032, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:54:17.453087", + "step": 1032, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008008824661374092, + "timestamp": "2025-09-04 03:54:17.475593", + "step": 1033, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:54:17.578569", + "step": 1033, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.004494689870625734, + "timestamp": "2025-09-04 03:54:17.597833", + "step": 1034, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:54:17.676279", + "step": 1034, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05889247730374336, + "timestamp": "2025-09-04 03:54:17.690066", + "step": 1035, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:54:17.794363", + "step": 1035, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03641991689801216, + "timestamp": "2025-09-04 03:54:17.814291", + "step": 1036, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:54:17.915191", + "step": 1036, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015427891165018082, + "timestamp": "2025-09-04 03:54:17.936059", + "step": 1037, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:54:18.047638", + "step": 1037, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04161277413368225, + "timestamp": "2025-09-04 03:54:18.068089", + "step": 1038, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:54:18.169531", + "step": 1038, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013311103917658329, + "timestamp": "2025-09-04 03:54:18.188142", + "step": 1039, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:54:18.289767", + "step": 1039, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.056311286985874176, + "timestamp": "2025-09-04 03:54:18.309120", + "step": 1040, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:54:26.699707", + "step": 1040, + "epoch": 1 + }, + { + "type": "pplx", + "content": 325.4783086719936, + "timestamp": "2025-09-04 03:54:26.702081", + "step": 1040, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1040", + "timestamp": "2025-09-04 03:54:27.217343", + "step": 1040, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:54:27.293442", + "step": 1040, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.005170788615942001, + "timestamp": "2025-09-04 03:54:27.308800", + "step": 1041, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:54:27.411906", + "step": 1041, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0431942418217659, + "timestamp": "2025-09-04 03:54:27.430779", + "step": 1042, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 880 + ], + "flops": 17600106910144.0 + }, + "timestamp": "2025-09-04 03:54:27.562183", + "step": 1042, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.003155430080369115, + "timestamp": "2025-09-04 03:54:27.585655", + "step": 1043, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:54:27.676253", + "step": 1043, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.031002743169665337, + "timestamp": "2025-09-04 03:54:27.693852", + "step": 1044, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1008 + ], + "flops": 20160122450880.0 + }, + "timestamp": "2025-09-04 03:54:27.835708", + "step": 1044, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0061140297912061214, + "timestamp": "2025-09-04 03:54:27.866848", + "step": 1045, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:54:27.970250", + "step": 1045, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024821752682328224, + "timestamp": "2025-09-04 03:54:27.989454", + "step": 1046, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:54:28.097849", + "step": 1046, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02534925378859043, + "timestamp": "2025-09-04 03:54:28.118164", + "step": 1047, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:54:28.217521", + "step": 1047, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06829417496919632, + "timestamp": "2025-09-04 03:54:28.236845", + "step": 1048, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:54:28.337902", + "step": 1048, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011538490653038025, + "timestamp": "2025-09-04 03:54:28.359059", + "step": 1049, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:54:28.460838", + "step": 1049, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017921442165970802, + "timestamp": "2025-09-04 03:54:28.479791", + "step": 1050, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:54:28.557920", + "step": 1050, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00594189902767539, + "timestamp": "2025-09-04 03:54:28.572056", + "step": 1051, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:54:28.661977", + "step": 1051, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06706573069095612, + "timestamp": "2025-09-04 03:54:28.679555", + "step": 1052, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 03:54:28.793603", + "step": 1052, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07539967447519302, + "timestamp": "2025-09-04 03:54:28.817899", + "step": 1053, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:54:28.911701", + "step": 1053, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01035250723361969, + "timestamp": "2025-09-04 03:54:28.929113", + "step": 1054, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 03:54:29.003669", + "step": 1054, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024351386353373528, + "timestamp": "2025-09-04 03:54:29.017233", + "step": 1055, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:54:29.122010", + "step": 1055, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.041539065539836884, + "timestamp": "2025-09-04 03:54:29.141863", + "step": 1056, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:54:29.238138", + "step": 1056, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03460566699504852, + "timestamp": "2025-09-04 03:54:29.258519", + "step": 1057, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 03:54:29.393251", + "step": 1057, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.005672653205692768, + "timestamp": "2025-09-04 03:54:29.418907", + "step": 1058, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:54:29.520526", + "step": 1058, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04056652635335922, + "timestamp": "2025-09-04 03:54:29.539428", + "step": 1059, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:54:29.625500", + "step": 1059, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018356602638959885, + "timestamp": "2025-09-04 03:54:29.641923", + "step": 1060, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:54:38.037867", + "step": 1060, + "epoch": 1 + }, + { + "type": "pplx", + "content": 329.5744553045225, + "timestamp": "2025-09-04 03:54:38.039700", + "step": 1060, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:54:38.137979", + "step": 1060, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018076708540320396, + "timestamp": "2025-09-04 03:54:38.159151", + "step": 1061, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:54:38.238436", + "step": 1061, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008117503486573696, + "timestamp": "2025-09-04 03:54:38.252549", + "step": 1062, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:54:38.330718", + "step": 1062, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.09541762620210648, + "timestamp": "2025-09-04 03:54:38.344655", + "step": 1063, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:54:38.455471", + "step": 1063, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.004544029477983713, + "timestamp": "2025-09-04 03:54:38.476657", + "step": 1064, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:54:38.553048", + "step": 1064, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018085738644003868, + "timestamp": "2025-09-04 03:54:38.568555", + "step": 1065, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:54:38.672167", + "step": 1065, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008873535320162773, + "timestamp": "2025-09-04 03:54:38.691241", + "step": 1066, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:54:38.798973", + "step": 1066, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016320547088980675, + "timestamp": "2025-09-04 03:54:38.818751", + "step": 1067, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:54:38.923333", + "step": 1067, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04694967344403267, + "timestamp": "2025-09-04 03:54:38.943139", + "step": 1068, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:54:39.051635", + "step": 1068, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05457077920436859, + "timestamp": "2025-09-04 03:54:39.073300", + "step": 1069, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:54:39.169792", + "step": 1069, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006687942426651716, + "timestamp": "2025-09-04 03:54:39.186448", + "step": 1070, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:54:39.281245", + "step": 1070, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019025299698114395, + "timestamp": "2025-09-04 03:54:39.298353", + "step": 1071, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:54:39.390091", + "step": 1071, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020227275788784027, + "timestamp": "2025-09-04 03:54:39.407340", + "step": 1072, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:54:39.500568", + "step": 1072, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05452266335487366, + "timestamp": "2025-09-04 03:54:39.519561", + "step": 1073, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:54:39.604031", + "step": 1073, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027135973796248436, + "timestamp": "2025-09-04 03:54:39.618960", + "step": 1074, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:54:39.718358", + "step": 1074, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027994517236948013, + "timestamp": "2025-09-04 03:54:39.736747", + "step": 1075, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:54:39.843238", + "step": 1075, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023070676252245903, + "timestamp": "2025-09-04 03:54:39.863118", + "step": 1076, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:54:39.963035", + "step": 1076, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01172536239027977, + "timestamp": "2025-09-04 03:54:39.983911", + "step": 1077, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:54:40.070577", + "step": 1077, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029984669759869576, + "timestamp": "2025-09-04 03:54:40.086018", + "step": 1078, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:54:40.179735", + "step": 1078, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024687113240361214, + "timestamp": "2025-09-04 03:54:40.196981", + "step": 1079, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:54:40.300861", + "step": 1079, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008470497094094753, + "timestamp": "2025-09-04 03:54:40.320739", + "step": 1080, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:54:48.711208", + "step": 1080, + "epoch": 1 + }, + { + "type": "pplx", + "content": 331.8358204811036, + "timestamp": "2025-09-04 03:54:48.713485", + "step": 1080, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1080", + "timestamp": "2025-09-04 03:54:49.226299", + "step": 1080, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:54:49.323088", + "step": 1080, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08154661953449249, + "timestamp": "2025-09-04 03:54:49.343622", + "step": 1081, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:54:49.448147", + "step": 1081, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.028146987780928612, + "timestamp": "2025-09-04 03:54:49.468034", + "step": 1082, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:54:49.571712", + "step": 1082, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024996791034936905, + "timestamp": "2025-09-04 03:54:49.590885", + "step": 1083, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1424 + ], + "flops": 28480172958272.0 + }, + "timestamp": "2025-09-04 03:54:49.802309", + "step": 1083, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03116009198129177, + "timestamp": "2025-09-04 03:54:49.843613", + "step": 1084, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:54:49.941437", + "step": 1084, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.032337624579668045, + "timestamp": "2025-09-04 03:54:49.961390", + "step": 1085, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:54:50.058815", + "step": 1085, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07175617665052414, + "timestamp": "2025-09-04 03:54:50.075368", + "step": 1086, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:54:50.169947", + "step": 1086, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0184723399579525, + "timestamp": "2025-09-04 03:54:50.186963", + "step": 1087, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:54:50.291628", + "step": 1087, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03585413470864296, + "timestamp": "2025-09-04 03:54:50.311636", + "step": 1088, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:54:50.392897", + "step": 1088, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03618989884853363, + "timestamp": "2025-09-04 03:54:50.409370", + "step": 1089, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:54:50.518834", + "step": 1089, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05040483921766281, + "timestamp": "2025-09-04 03:54:50.538855", + "step": 1090, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:54:50.639859", + "step": 1090, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05431760847568512, + "timestamp": "2025-09-04 03:54:50.658560", + "step": 1091, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:54:50.753968", + "step": 1091, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.10624273121356964, + "timestamp": "2025-09-04 03:54:50.772035", + "step": 1092, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 03:54:50.844993", + "step": 1092, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03237966448068619, + "timestamp": "2025-09-04 03:54:50.859614", + "step": 1093, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:54:50.958245", + "step": 1093, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04587221145629883, + "timestamp": "2025-09-04 03:54:50.976639", + "step": 1094, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 03:54:51.094402", + "step": 1094, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.028815526515245438, + "timestamp": "2025-09-04 03:54:51.116563", + "step": 1095, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:54:51.220135", + "step": 1095, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013644875027239323, + "timestamp": "2025-09-04 03:54:51.239889", + "step": 1096, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1024 + ], + "flops": 20480124393472.0 + }, + "timestamp": "2025-09-04 03:54:51.384380", + "step": 1096, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03480622544884682, + "timestamp": "2025-09-04 03:54:51.415384", + "step": 1097, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:54:51.523257", + "step": 1097, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03265468776226044, + "timestamp": "2025-09-04 03:54:51.543382", + "step": 1098, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:54:51.638835", + "step": 1098, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.003582942998036742, + "timestamp": "2025-09-04 03:54:51.656106", + "step": 1099, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:54:51.749819", + "step": 1099, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.058865927159786224, + "timestamp": "2025-09-04 03:54:51.767754", + "step": 1100, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:55:00.169117", + "step": 1100, + "epoch": 1 + }, + { + "type": "pplx", + "content": 330.568579902323, + "timestamp": "2025-09-04 03:55:00.171543", + "step": 1100, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:55:00.268289", + "step": 1100, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06865076720714569, + "timestamp": "2025-09-04 03:55:00.288903", + "step": 1101, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:55:00.390269", + "step": 1101, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.10805238038301468, + "timestamp": "2025-09-04 03:55:00.408882", + "step": 1102, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 03:55:00.484730", + "step": 1102, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01404650043696165, + "timestamp": "2025-09-04 03:55:00.498048", + "step": 1103, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:55:00.590116", + "step": 1103, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06355747580528259, + "timestamp": "2025-09-04 03:55:00.607459", + "step": 1104, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:55:00.714230", + "step": 1104, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02257397770881653, + "timestamp": "2025-09-04 03:55:00.736553", + "step": 1105, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:55:00.835157", + "step": 1105, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03384973481297493, + "timestamp": "2025-09-04 03:55:00.852475", + "step": 1106, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:55:00.956418", + "step": 1106, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.001299434108659625, + "timestamp": "2025-09-04 03:55:00.975594", + "step": 1107, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 03:55:01.051270", + "step": 1107, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02580624632537365, + "timestamp": "2025-09-04 03:55:01.065411", + "step": 1108, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:55:01.157686", + "step": 1108, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04326159134507179, + "timestamp": "2025-09-04 03:55:01.176243", + "step": 1109, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:55:01.277634", + "step": 1109, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05120621249079704, + "timestamp": "2025-09-04 03:55:01.296313", + "step": 1110, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:55:01.399016", + "step": 1110, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0068900627084076405, + "timestamp": "2025-09-04 03:55:01.417975", + "step": 1111, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:55:01.528856", + "step": 1111, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018264418467879295, + "timestamp": "2025-09-04 03:55:01.550027", + "step": 1112, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:55:01.630695", + "step": 1112, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022158373147249222, + "timestamp": "2025-09-04 03:55:01.647141", + "step": 1113, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1184 + ], + "flops": 23680143819392.0 + }, + "timestamp": "2025-09-04 03:55:01.818413", + "step": 1113, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.027837276458740234, + "timestamp": "2025-09-04 03:55:01.853035", + "step": 1114, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:55:01.955510", + "step": 1114, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.044819965958595276, + "timestamp": "2025-09-04 03:55:01.974485", + "step": 1115, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:55:02.076943", + "step": 1115, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.040360935032367706, + "timestamp": "2025-09-04 03:55:02.096612", + "step": 1116, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:55:02.197548", + "step": 1116, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030223630368709564, + "timestamp": "2025-09-04 03:55:02.218582", + "step": 1117, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 03:55:02.293568", + "step": 1117, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04847509413957596, + "timestamp": "2025-09-04 03:55:02.306906", + "step": 1118, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:55:02.401949", + "step": 1118, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0198849868029356, + "timestamp": "2025-09-04 03:55:02.419194", + "step": 1119, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:55:02.504757", + "step": 1119, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0160811897367239, + "timestamp": "2025-09-04 03:55:02.520772", + "step": 1120, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:55:10.996486", + "step": 1120, + "epoch": 1 + }, + { + "type": "pplx", + "content": 333.6826089296669, + "timestamp": "2025-09-04 03:55:10.998830", + "step": 1120, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1120", + "timestamp": "2025-09-04 03:55:11.503767", + "step": 1120, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:55:11.577559", + "step": 1120, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.047653671354055405, + "timestamp": "2025-09-04 03:55:11.592267", + "step": 1121, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:55:11.688189", + "step": 1121, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02942013181746006, + "timestamp": "2025-09-04 03:55:11.705321", + "step": 1122, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:55:11.811599", + "step": 1122, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04028499498963356, + "timestamp": "2025-09-04 03:55:11.830641", + "step": 1123, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:55:11.919412", + "step": 1123, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029223607853055, + "timestamp": "2025-09-04 03:55:11.935618", + "step": 1124, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1232 + ], + "flops": 24640149647168.0 + }, + "timestamp": "2025-09-04 03:55:12.117352", + "step": 1124, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.100711390376091, + "timestamp": "2025-09-04 03:55:12.154447", + "step": 1125, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:55:12.241477", + "step": 1125, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009318535216152668, + "timestamp": "2025-09-04 03:55:12.256752", + "step": 1126, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:55:12.361415", + "step": 1126, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.049649372696876526, + "timestamp": "2025-09-04 03:55:12.380493", + "step": 1127, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:55:12.491512", + "step": 1127, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015808911994099617, + "timestamp": "2025-09-04 03:55:12.510709", + "step": 1128, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 944 + ], + "flops": 18880114680512.0 + }, + "timestamp": "2025-09-04 03:55:12.646841", + "step": 1128, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010250390507280827, + "timestamp": "2025-09-04 03:55:12.675085", + "step": 1129, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:55:12.763963", + "step": 1129, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0313245952129364, + "timestamp": "2025-09-04 03:55:12.779365", + "step": 1130, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:55:12.893061", + "step": 1130, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.10398827493190765, + "timestamp": "2025-09-04 03:55:12.912994", + "step": 1131, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:55:13.017369", + "step": 1131, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03392321988940239, + "timestamp": "2025-09-04 03:55:13.036669", + "step": 1132, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:55:13.126425", + "step": 1132, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008513440378010273, + "timestamp": "2025-09-04 03:55:13.144531", + "step": 1133, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:55:13.247351", + "step": 1133, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0382850281894207, + "timestamp": "2025-09-04 03:55:13.265558", + "step": 1134, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:55:13.344506", + "step": 1134, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07204016298055649, + "timestamp": "2025-09-04 03:55:13.358254", + "step": 1135, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:55:13.444419", + "step": 1135, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014276178553700447, + "timestamp": "2025-09-04 03:55:13.460401", + "step": 1136, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:55:13.561239", + "step": 1136, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0037956838496029377, + "timestamp": "2025-09-04 03:55:13.580737", + "step": 1137, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:55:13.689193", + "step": 1137, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.005481324158608913, + "timestamp": "2025-09-04 03:55:13.707893", + "step": 1138, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:55:13.809507", + "step": 1138, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0013551759766414762, + "timestamp": "2025-09-04 03:55:13.827493", + "step": 1139, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 960 + ], + "flops": 19200116623104.0 + }, + "timestamp": "2025-09-04 03:55:13.967598", + "step": 1139, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0756765827536583, + "timestamp": "2025-09-04 03:55:13.994045", + "step": 1140, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:55:22.505779", + "step": 1140, + "epoch": 1 + }, + { + "type": "pplx", + "content": 342.1746491135325, + "timestamp": "2025-09-04 03:55:22.508181", + "step": 1140, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 03:55:22.625590", + "step": 1140, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01116140466183424, + "timestamp": "2025-09-04 03:55:22.651011", + "step": 1141, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1376 + ], + "flops": 27520167130496.0 + }, + "timestamp": "2025-09-04 03:55:22.855245", + "step": 1141, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.031512316316366196, + "timestamp": "2025-09-04 03:55:22.894487", + "step": 1142, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:55:23.005275", + "step": 1142, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010817242786288261, + "timestamp": "2025-09-04 03:55:23.025885", + "step": 1143, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:55:23.122786", + "step": 1143, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011584990657866001, + "timestamp": "2025-09-04 03:55:23.141003", + "step": 1144, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:55:23.232853", + "step": 1144, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01660754159092903, + "timestamp": "2025-09-04 03:55:23.251496", + "step": 1145, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:55:23.362256", + "step": 1145, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.042043667286634445, + "timestamp": "2025-09-04 03:55:23.382433", + "step": 1146, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:55:23.491895", + "step": 1146, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04430728405714035, + "timestamp": "2025-09-04 03:55:23.512635", + "step": 1147, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:55:23.616816", + "step": 1147, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04766961559653282, + "timestamp": "2025-09-04 03:55:23.636716", + "step": 1148, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:55:23.729421", + "step": 1148, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02647322788834572, + "timestamp": "2025-09-04 03:55:23.748507", + "step": 1149, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:55:23.833529", + "step": 1149, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008523870259523392, + "timestamp": "2025-09-04 03:55:23.849020", + "step": 1150, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:55:23.939069", + "step": 1150, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03426166996359825, + "timestamp": "2025-09-04 03:55:23.955715", + "step": 1151, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:55:24.040437", + "step": 1151, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0056257485412061214, + "timestamp": "2025-09-04 03:55:24.056529", + "step": 1152, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:55:24.148754", + "step": 1152, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03579988703131676, + "timestamp": "2025-09-04 03:55:24.167610", + "step": 1153, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:55:24.270568", + "step": 1153, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021155666559934616, + "timestamp": "2025-09-04 03:55:24.289678", + "step": 1154, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:55:24.400847", + "step": 1154, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.020209012553095818, + "timestamp": "2025-09-04 03:55:24.421226", + "step": 1155, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:55:24.530289", + "step": 1155, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08458837121725082, + "timestamp": "2025-09-04 03:55:24.550087", + "step": 1156, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:55:24.632987", + "step": 1156, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04702272638678551, + "timestamp": "2025-09-04 03:55:24.649804", + "step": 1157, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 03:55:24.772244", + "step": 1157, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03629223629832268, + "timestamp": "2025-09-04 03:55:24.795123", + "step": 1158, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:55:24.906181", + "step": 1158, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010823562741279602, + "timestamp": "2025-09-04 03:55:24.926963", + "step": 1159, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:55:25.026738", + "step": 1159, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.015217112377285957, + "timestamp": "2025-09-04 03:55:25.045911", + "step": 1160, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:55:33.427667", + "step": 1160, + "epoch": 1 + }, + { + "type": "pplx", + "content": 348.55127656576644, + "timestamp": "2025-09-04 03:55:33.429586", + "step": 1160, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1160", + "timestamp": "2025-09-04 03:55:33.897323", + "step": 1160, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:55:34.003968", + "step": 1160, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05122144892811775, + "timestamp": "2025-09-04 03:55:34.026479", + "step": 1161, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:55:34.128682", + "step": 1161, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02695164829492569, + "timestamp": "2025-09-04 03:55:34.147589", + "step": 1162, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:55:34.226537", + "step": 1162, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07939313352108002, + "timestamp": "2025-09-04 03:55:34.240538", + "step": 1163, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:55:34.347710", + "step": 1163, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.032340407371520996, + "timestamp": "2025-09-04 03:55:34.367546", + "step": 1164, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:55:34.468983", + "step": 1164, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04987955838441849, + "timestamp": "2025-09-04 03:55:34.489115", + "step": 1165, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:55:34.582559", + "step": 1165, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.005085882265120745, + "timestamp": "2025-09-04 03:55:34.599866", + "step": 1166, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1024 + ], + "flops": 20480124393472.0 + }, + "timestamp": "2025-09-04 03:55:34.745576", + "step": 1166, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.028625115752220154, + "timestamp": "2025-09-04 03:55:34.773627", + "step": 1167, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 416 + ], + "flops": 8320050574976.0 + }, + "timestamp": "2025-09-04 03:55:34.844590", + "step": 1167, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04103225842118263, + "timestamp": "2025-09-04 03:55:34.857925", + "step": 1168, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:55:34.960699", + "step": 1168, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029832925647497177, + "timestamp": "2025-09-04 03:55:34.981572", + "step": 1169, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:55:35.091917", + "step": 1169, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006762138567864895, + "timestamp": "2025-09-04 03:55:35.112008", + "step": 1170, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1120 + ], + "flops": 22400136049024.0 + }, + "timestamp": "2025-09-04 03:55:35.274883", + "step": 1170, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04091715067625046, + "timestamp": "2025-09-04 03:55:35.306668", + "step": 1171, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:55:35.390858", + "step": 1171, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07869014889001846, + "timestamp": "2025-09-04 03:55:35.406503", + "step": 1172, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:55:35.507835", + "step": 1172, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.07957983762025833, + "timestamp": "2025-09-04 03:55:35.528848", + "step": 1173, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 416 + ], + "flops": 8320050574976.0 + }, + "timestamp": "2025-09-04 03:55:35.598923", + "step": 1173, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00997950043529272, + "timestamp": "2025-09-04 03:55:35.611323", + "step": 1174, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:55:35.722162", + "step": 1174, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04756058380007744, + "timestamp": "2025-09-04 03:55:35.742478", + "step": 1175, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:55:35.829004", + "step": 1175, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.046221889555454254, + "timestamp": "2025-09-04 03:55:35.845178", + "step": 1176, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:55:35.946113", + "step": 1176, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030887359753251076, + "timestamp": "2025-09-04 03:55:35.966861", + "step": 1177, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:55:36.067236", + "step": 1177, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03892279416322708, + "timestamp": "2025-09-04 03:55:36.085561", + "step": 1178, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:55:36.164518", + "step": 1178, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007792294956743717, + "timestamp": "2025-09-04 03:55:36.178652", + "step": 1179, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:55:36.288591", + "step": 1179, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05254804715514183, + "timestamp": "2025-09-04 03:55:36.309426", + "step": 1180, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:55:44.707890", + "step": 1180, + "epoch": 1 + }, + { + "type": "pplx", + "content": 353.37533564087937, + "timestamp": "2025-09-04 03:55:44.709798", + "step": 1180, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:55:44.791751", + "step": 1180, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.028075462207198143, + "timestamp": "2025-09-04 03:55:44.808892", + "step": 1181, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:55:44.904297", + "step": 1181, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.062234699726104736, + "timestamp": "2025-09-04 03:55:44.921666", + "step": 1182, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:55:45.029947", + "step": 1182, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05085289850831032, + "timestamp": "2025-09-04 03:55:45.050048", + "step": 1183, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:55:45.126108", + "step": 1183, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03087581880390644, + "timestamp": "2025-09-04 03:55:45.140649", + "step": 1184, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:55:45.243704", + "step": 1184, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008511030115187168, + "timestamp": "2025-09-04 03:55:45.265511", + "step": 1185, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:55:45.377248", + "step": 1185, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06436464190483093, + "timestamp": "2025-09-04 03:55:45.397541", + "step": 1186, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:55:45.493257", + "step": 1186, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01800878904759884, + "timestamp": "2025-09-04 03:55:45.510182", + "step": 1187, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:55:45.611829", + "step": 1187, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.004938524682074785, + "timestamp": "2025-09-04 03:55:45.631169", + "step": 1188, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1088 + ], + "flops": 21760132163840.0 + }, + "timestamp": "2025-09-04 03:55:45.785314", + "step": 1188, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03547127917408943, + "timestamp": "2025-09-04 03:55:45.818526", + "step": 1189, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:55:45.923937", + "step": 1189, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04392065852880478, + "timestamp": "2025-09-04 03:55:45.943073", + "step": 1190, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:55:46.044602", + "step": 1190, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.005546086002141237, + "timestamp": "2025-09-04 03:55:46.061763", + "step": 1191, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:55:46.173039", + "step": 1191, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019072137773036957, + "timestamp": "2025-09-04 03:55:46.193789", + "step": 1192, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:55:46.285071", + "step": 1192, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.018132086843252182, + "timestamp": "2025-09-04 03:55:46.301809", + "step": 1193, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:55:46.392004", + "step": 1193, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04924427345395088, + "timestamp": "2025-09-04 03:55:46.407467", + "step": 1194, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:55:46.510364", + "step": 1194, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03797117620706558, + "timestamp": "2025-09-04 03:55:46.528840", + "step": 1195, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:55:46.627211", + "step": 1195, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.040146905928850174, + "timestamp": "2025-09-04 03:55:46.644773", + "step": 1196, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:55:46.746916", + "step": 1196, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03447960317134857, + "timestamp": "2025-09-04 03:55:46.767915", + "step": 1197, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 800 + ], + "flops": 16000097197184.0 + }, + "timestamp": "2025-09-04 03:55:46.890588", + "step": 1197, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.019626695662736893, + "timestamp": "2025-09-04 03:55:46.911658", + "step": 1198, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:55:47.004148", + "step": 1198, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00873672403395176, + "timestamp": "2025-09-04 03:55:47.020696", + "step": 1199, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:55:47.128916", + "step": 1199, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06406814604997635, + "timestamp": "2025-09-04 03:55:47.146364", + "step": 1200, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:55:55.666983", + "step": 1200, + "epoch": 1 + }, + { + "type": "pplx", + "content": 351.81034139828853, + "timestamp": "2025-09-04 03:55:55.668968", + "step": 1200, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1200", + "timestamp": "2025-09-04 03:55:56.192955", + "step": 1200, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 384 + ], + "flops": 7680046689792.0 + }, + "timestamp": "2025-09-04 03:55:56.257064", + "step": 1200, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.00952016282826662, + "timestamp": "2025-09-04 03:55:56.268026", + "step": 1201, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:55:56.351897", + "step": 1201, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014243190176784992, + "timestamp": "2025-09-04 03:55:56.366624", + "step": 1202, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 896 + ], + "flops": 17920108852736.0 + }, + "timestamp": "2025-09-04 03:55:56.497096", + "step": 1202, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05320248380303383, + "timestamp": "2025-09-04 03:55:56.520822", + "step": 1203, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:55:56.610589", + "step": 1203, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05997435748577118, + "timestamp": "2025-09-04 03:55:56.626370", + "step": 1204, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:55:56.741582", + "step": 1204, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.037900906056165695, + "timestamp": "2025-09-04 03:55:56.763717", + "step": 1205, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:55:56.874415", + "step": 1205, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06352101266384125, + "timestamp": "2025-09-04 03:55:56.894641", + "step": 1206, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:55:57.001142", + "step": 1206, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04056302830576897, + "timestamp": "2025-09-04 03:55:57.020228", + "step": 1207, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:55:57.123990", + "step": 1207, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02337026037275791, + "timestamp": "2025-09-04 03:55:57.143031", + "step": 1208, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:55:57.222919", + "step": 1208, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.022142881527543068, + "timestamp": "2025-09-04 03:55:57.237961", + "step": 1209, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:55:57.353958", + "step": 1209, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.008503721095621586, + "timestamp": "2025-09-04 03:55:57.374604", + "step": 1210, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:55:57.481228", + "step": 1210, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024838682264089584, + "timestamp": "2025-09-04 03:55:57.500077", + "step": 1211, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:55:57.607060", + "step": 1211, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.016720108687877655, + "timestamp": "2025-09-04 03:55:57.626884", + "step": 1212, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:55:57.716274", + "step": 1212, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03313479945063591, + "timestamp": "2025-09-04 03:55:57.734459", + "step": 1213, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:55:57.839337", + "step": 1213, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03852255269885063, + "timestamp": "2025-09-04 03:55:57.858178", + "step": 1214, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:55:57.959198", + "step": 1214, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030346812680363655, + "timestamp": "2025-09-04 03:55:57.978050", + "step": 1215, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:55:58.080948", + "step": 1215, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013426399789750576, + "timestamp": "2025-09-04 03:55:58.100720", + "step": 1216, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 816 + ], + "flops": 16320099139776.0 + }, + "timestamp": "2025-09-04 03:55:58.219568", + "step": 1216, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007183226756751537, + "timestamp": "2025-09-04 03:55:58.244859", + "step": 1217, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:55:58.331516", + "step": 1217, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.024007325991988182, + "timestamp": "2025-09-04 03:55:58.346913", + "step": 1218, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:55:58.446232", + "step": 1218, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01614953577518463, + "timestamp": "2025-09-04 03:55:58.464742", + "step": 1219, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:55:58.566439", + "step": 1219, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05485903471708298, + "timestamp": "2025-09-04 03:55:58.585891", + "step": 1220, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:56:07.237314", + "step": 1220, + "epoch": 1 + }, + { + "type": "pplx", + "content": 343.1063356120184, + "timestamp": "2025-09-04 03:56:07.239254", + "step": 1220, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:56:07.337811", + "step": 1220, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009709849022328854, + "timestamp": "2025-09-04 03:56:07.358941", + "step": 1221, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:56:07.453863", + "step": 1221, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0713476687669754, + "timestamp": "2025-09-04 03:56:07.471290", + "step": 1222, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:56:07.574956", + "step": 1222, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014135504141449928, + "timestamp": "2025-09-04 03:56:07.594208", + "step": 1223, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:56:07.694439", + "step": 1223, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08039960265159607, + "timestamp": "2025-09-04 03:56:07.713550", + "step": 1224, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:56:07.811723", + "step": 1224, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006991466507315636, + "timestamp": "2025-09-04 03:56:07.832184", + "step": 1225, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:56:07.935653", + "step": 1225, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.003120355773717165, + "timestamp": "2025-09-04 03:56:07.954778", + "step": 1226, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:56:08.063069", + "step": 1226, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03354727104306221, + "timestamp": "2025-09-04 03:56:08.083308", + "step": 1227, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 03:56:08.199707", + "step": 1227, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.004579135682433844, + "timestamp": "2025-09-04 03:56:08.222359", + "step": 1228, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:56:08.315258", + "step": 1228, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010765165090560913, + "timestamp": "2025-09-04 03:56:08.334373", + "step": 1229, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:56:08.413365", + "step": 1229, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03472265228629112, + "timestamp": "2025-09-04 03:56:08.427468", + "step": 1230, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:56:08.532931", + "step": 1230, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.028265012428164482, + "timestamp": "2025-09-04 03:56:08.552694", + "step": 1231, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:56:08.668961", + "step": 1231, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06229390949010849, + "timestamp": "2025-09-04 03:56:08.688691", + "step": 1232, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:56:08.771497", + "step": 1232, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.013525002636015415, + "timestamp": "2025-09-04 03:56:08.787962", + "step": 1233, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 03:56:08.905323", + "step": 1233, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.011920424178242683, + "timestamp": "2025-09-04 03:56:08.927293", + "step": 1234, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:56:09.003578", + "step": 1234, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01954522728919983, + "timestamp": "2025-09-04 03:56:09.017136", + "step": 1235, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:56:09.124546", + "step": 1235, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.002615840407088399, + "timestamp": "2025-09-04 03:56:09.145091", + "step": 1236, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:56:09.253716", + "step": 1236, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.051942598074674606, + "timestamp": "2025-09-04 03:56:09.276397", + "step": 1237, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:56:09.370402", + "step": 1237, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.006919113453477621, + "timestamp": "2025-09-04 03:56:09.387324", + "step": 1238, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:56:09.472227", + "step": 1238, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.11789951473474503, + "timestamp": "2025-09-04 03:56:09.487161", + "step": 1239, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:56:09.580412", + "step": 1239, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.031079446896910667, + "timestamp": "2025-09-04 03:56:09.598105", + "step": 1240, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:56:17.965019", + "step": 1240, + "epoch": 1 + }, + { + "type": "pplx", + "content": 339.34815384838885, + "timestamp": "2025-09-04 03:56:17.966998", + "step": 1240, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1240", + "timestamp": "2025-09-04 03:56:18.478463", + "step": 1240, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:56:18.583380", + "step": 1240, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.046426158398389816, + "timestamp": "2025-09-04 03:56:18.605824", + "step": 1241, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:56:18.716749", + "step": 1241, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009461128152906895, + "timestamp": "2025-09-04 03:56:18.737124", + "step": 1242, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:56:18.836445", + "step": 1242, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02478550747036934, + "timestamp": "2025-09-04 03:56:18.853794", + "step": 1243, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 432 + ], + "flops": 8640052517568.0 + }, + "timestamp": "2025-09-04 03:56:18.924573", + "step": 1243, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021543040871620178, + "timestamp": "2025-09-04 03:56:18.937862", + "step": 1244, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:56:19.022085", + "step": 1244, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0028493471909314394, + "timestamp": "2025-09-04 03:56:19.038702", + "step": 1245, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:56:19.147377", + "step": 1245, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03405190631747246, + "timestamp": "2025-09-04 03:56:19.165832", + "step": 1246, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:56:19.266960", + "step": 1246, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04478135332465172, + "timestamp": "2025-09-04 03:56:19.285942", + "step": 1247, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:56:19.388388", + "step": 1247, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06943120062351227, + "timestamp": "2025-09-04 03:56:19.408022", + "step": 1248, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:56:19.508182", + "step": 1248, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01673070713877678, + "timestamp": "2025-09-04 03:56:19.528349", + "step": 1249, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:56:19.635364", + "step": 1249, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04285313934087753, + "timestamp": "2025-09-04 03:56:19.655112", + "step": 1250, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:56:19.758825", + "step": 1250, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007856685668230057, + "timestamp": "2025-09-04 03:56:19.777910", + "step": 1251, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 03:56:19.852320", + "step": 1251, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.033521927893161774, + "timestamp": "2025-09-04 03:56:19.866604", + "step": 1252, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:56:19.958870", + "step": 1252, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05988616868853569, + "timestamp": "2025-09-04 03:56:19.977743", + "step": 1253, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:56:20.083349", + "step": 1253, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05120409280061722, + "timestamp": "2025-09-04 03:56:20.103094", + "step": 1254, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:56:20.188253", + "step": 1254, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03182673081755638, + "timestamp": "2025-09-04 03:56:20.203436", + "step": 1255, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:56:20.296178", + "step": 1255, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03344005346298218, + "timestamp": "2025-09-04 03:56:20.313868", + "step": 1256, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:56:20.402001", + "step": 1256, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0048216646537184715, + "timestamp": "2025-09-04 03:56:20.420176", + "step": 1257, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:56:20.501654", + "step": 1257, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04110339656472206, + "timestamp": "2025-09-04 03:56:20.515279", + "step": 1258, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:56:20.616816", + "step": 1258, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0046073743142187595, + "timestamp": "2025-09-04 03:56:20.635877", + "step": 1259, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:56:20.738003", + "step": 1259, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.009192178025841713, + "timestamp": "2025-09-04 03:56:20.757753", + "step": 1260, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:56:29.142086", + "step": 1260, + "epoch": 1 + }, + { + "type": "pplx", + "content": 340.13528680803216, + "timestamp": "2025-09-04 03:56:29.144161", + "step": 1260, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:56:29.223599", + "step": 1260, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03510812297463417, + "timestamp": "2025-09-04 03:56:29.240057", + "step": 1261, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:56:29.348362", + "step": 1261, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04541385546326637, + "timestamp": "2025-09-04 03:56:29.368470", + "step": 1262, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:56:29.470427", + "step": 1262, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.021681929007172585, + "timestamp": "2025-09-04 03:56:29.489312", + "step": 1263, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:56:29.582465", + "step": 1263, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014316936954855919, + "timestamp": "2025-09-04 03:56:29.600152", + "step": 1264, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:56:29.704522", + "step": 1264, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023813286796212196, + "timestamp": "2025-09-04 03:56:29.726481", + "step": 1265, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:56:29.837560", + "step": 1265, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06364905834197998, + "timestamp": "2025-09-04 03:56:29.857947", + "step": 1266, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1024 + ], + "flops": 20480124393472.0 + }, + "timestamp": "2025-09-04 03:56:30.003616", + "step": 1266, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02237660065293312, + "timestamp": "2025-09-04 03:56:30.031688", + "step": 1267, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:56:30.132118", + "step": 1267, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.050466664135456085, + "timestamp": "2025-09-04 03:56:30.151541", + "step": 1268, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:56:30.241954", + "step": 1268, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01618255488574505, + "timestamp": "2025-09-04 03:56:30.260597", + "step": 1269, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:56:30.344420", + "step": 1269, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.16435541212558746, + "timestamp": "2025-09-04 03:56:30.359316", + "step": 1270, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:56:30.461700", + "step": 1270, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03205430880188942, + "timestamp": "2025-09-04 03:56:30.480832", + "step": 1271, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:56:30.558405", + "step": 1271, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06162261217832565, + "timestamp": "2025-09-04 03:56:30.572763", + "step": 1272, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:56:30.656850", + "step": 1272, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.023222552612423897, + "timestamp": "2025-09-04 03:56:30.673770", + "step": 1273, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:56:30.781833", + "step": 1273, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06229608505964279, + "timestamp": "2025-09-04 03:56:30.801926", + "step": 1274, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:56:30.910768", + "step": 1274, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.0057587032206356525, + "timestamp": "2025-09-04 03:56:30.931221", + "step": 1275, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:56:31.022716", + "step": 1275, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026004638522863388, + "timestamp": "2025-09-04 03:56:31.040502", + "step": 1276, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:56:31.130737", + "step": 1276, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03231319040060043, + "timestamp": "2025-09-04 03:56:31.149923", + "step": 1277, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:56:31.251356", + "step": 1277, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.02814597263932228, + "timestamp": "2025-09-04 03:56:31.270274", + "step": 1278, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 03:56:31.344681", + "step": 1278, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04910106584429741, + "timestamp": "2025-09-04 03:56:31.358005", + "step": 1279, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:56:31.450944", + "step": 1279, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.05825873091816902, + "timestamp": "2025-09-04 03:56:31.467043", + "step": 1280, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:56:39.844352", + "step": 1280, + "epoch": 1 + }, + { + "type": "pplx", + "content": 340.68138853975984, + "timestamp": "2025-09-04 03:56:39.846112", + "step": 1280, + "epoch": 1 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1280", + "timestamp": "2025-09-04 03:56:40.198819", + "step": 1280, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:56:40.296797", + "step": 1280, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.010946203954517841, + "timestamp": "2025-09-04 03:56:40.317552", + "step": 1281, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:56:40.402157", + "step": 1281, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01738363690674305, + "timestamp": "2025-09-04 03:56:40.417412", + "step": 1282, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:56:40.511099", + "step": 1282, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026985451579093933, + "timestamp": "2025-09-04 03:56:40.528410", + "step": 1283, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:56:40.621743", + "step": 1283, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.005689225625246763, + "timestamp": "2025-09-04 03:56:40.639547", + "step": 1284, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:56:40.730975", + "step": 1284, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.03335564583539963, + "timestamp": "2025-09-04 03:56:40.749866", + "step": 1285, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:56:40.850254", + "step": 1285, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.014564265497028828, + "timestamp": "2025-09-04 03:56:40.868661", + "step": 1286, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:56:40.967605", + "step": 1286, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04401400312781334, + "timestamp": "2025-09-04 03:56:40.985931", + "step": 1287, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:56:41.081033", + "step": 1287, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.031287554651498795, + "timestamp": "2025-09-04 03:56:41.099129", + "step": 1288, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:56:41.208052", + "step": 1288, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01182449609041214, + "timestamp": "2025-09-04 03:56:41.230594", + "step": 1289, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:56:41.314338", + "step": 1289, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.08080806583166122, + "timestamp": "2025-09-04 03:56:41.329180", + "step": 1290, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:56:41.433827", + "step": 1290, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.005769069772213697, + "timestamp": "2025-09-04 03:56:41.452928", + "step": 1291, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:56:41.556488", + "step": 1291, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007577816490083933, + "timestamp": "2025-09-04 03:56:41.576092", + "step": 1292, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:56:41.667203", + "step": 1292, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.029849909245967865, + "timestamp": "2025-09-04 03:56:41.685829", + "step": 1293, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:56:41.795843", + "step": 1293, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.04431498050689697, + "timestamp": "2025-09-04 03:56:41.816259", + "step": 1294, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:56:41.919388", + "step": 1294, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.01207160297781229, + "timestamp": "2025-09-04 03:56:41.938071", + "step": 1295, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 912 + ], + "flops": 18240110795328.0 + }, + "timestamp": "2025-09-04 03:56:42.072302", + "step": 1295, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.007530734874308109, + "timestamp": "2025-09-04 03:56:42.097544", + "step": 1296, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:56:42.187551", + "step": 1296, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.030552592128515244, + "timestamp": "2025-09-04 03:56:42.206373", + "step": 1297, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:56:42.283972", + "step": 1297, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.026529986411333084, + "timestamp": "2025-09-04 03:56:42.297966", + "step": 1298, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:56:42.390767", + "step": 1298, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.06174427270889282, + "timestamp": "2025-09-04 03:56:42.407715", + "step": 1299, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:56:42.524343", + "step": 1299, + "epoch": 1 + }, + { + "type": "loss", + "content": 0.017831875011324883, + "timestamp": "2025-09-04 03:56:42.545459", + "step": 1300, + "epoch": 1 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:56:50.958266", + "step": 1300, + "epoch": 1 + }, + { + "type": "pplx", + "content": 344.45454539575513, + "timestamp": "2025-09-04 03:56:50.960462", + "step": 1300, + "epoch": 1 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:56:51.033210", + "step": 1300, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00863197073340416, + "timestamp": "2025-09-04 03:56:51.048203", + "step": 1301, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:56:51.152787", + "step": 1301, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03207479417324066, + "timestamp": "2025-09-04 03:56:51.171776", + "step": 1302, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:56:51.272855", + "step": 1302, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03501874580979347, + "timestamp": "2025-09-04 03:56:51.291496", + "step": 1303, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:56:51.395438", + "step": 1303, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.040076903998851776, + "timestamp": "2025-09-04 03:56:51.415398", + "step": 1304, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:56:51.507169", + "step": 1304, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.06259066611528397, + "timestamp": "2025-09-04 03:56:51.526093", + "step": 1305, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:56:51.621169", + "step": 1305, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005843855440616608, + "timestamp": "2025-09-04 03:56:51.638364", + "step": 1306, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:56:51.746720", + "step": 1306, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016627954319119453, + "timestamp": "2025-09-04 03:56:51.766813", + "step": 1307, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:56:51.870051", + "step": 1307, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01708150841295719, + "timestamp": "2025-09-04 03:56:51.889764", + "step": 1308, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:56:51.993489", + "step": 1308, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02616330236196518, + "timestamp": "2025-09-04 03:56:52.015383", + "step": 1309, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:56:52.110091", + "step": 1309, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0031908308155834675, + "timestamp": "2025-09-04 03:56:52.127233", + "step": 1310, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 03:56:52.199864", + "step": 1310, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03640008345246315, + "timestamp": "2025-09-04 03:56:52.212599", + "step": 1311, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:56:52.308612", + "step": 1311, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009561690501868725, + "timestamp": "2025-09-04 03:56:52.326635", + "step": 1312, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:56:52.424790", + "step": 1312, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04150993376970291, + "timestamp": "2025-09-04 03:56:52.445011", + "step": 1313, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:56:52.536780", + "step": 1313, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.019684717059135437, + "timestamp": "2025-09-04 03:56:52.552190", + "step": 1314, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:56:52.661011", + "step": 1314, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011558826081454754, + "timestamp": "2025-09-04 03:56:52.681084", + "step": 1315, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:56:52.767800", + "step": 1315, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01043105311691761, + "timestamp": "2025-09-04 03:56:52.783965", + "step": 1316, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:56:52.877263", + "step": 1316, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02803068794310093, + "timestamp": "2025-09-04 03:56:52.896318", + "step": 1317, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:56:52.973511", + "step": 1317, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.09199260920286179, + "timestamp": "2025-09-04 03:56:52.987265", + "step": 1318, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 03:56:53.059699", + "step": 1318, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00931278895586729, + "timestamp": "2025-09-04 03:56:53.072457", + "step": 1319, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:56:53.183382", + "step": 1319, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008658390492200851, + "timestamp": "2025-09-04 03:56:53.204794", + "step": 1320, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:57:01.599545", + "step": 1320, + "epoch": 2 + }, + { + "type": "pplx", + "content": 351.6464470611636, + "timestamp": "2025-09-04 03:57:01.601996", + "step": 1320, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1320", + "timestamp": "2025-09-04 03:57:02.093639", + "step": 1320, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:57:02.198545", + "step": 1320, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009841070510447025, + "timestamp": "2025-09-04 03:57:02.220801", + "step": 1321, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1392 + ], + "flops": 27840169073088.0 + }, + "timestamp": "2025-09-04 03:57:02.426115", + "step": 1321, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016447851434350014, + "timestamp": "2025-09-04 03:57:02.465621", + "step": 1322, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:57:02.576052", + "step": 1322, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.030630143359303474, + "timestamp": "2025-09-04 03:57:02.596668", + "step": 1323, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1472 + ], + "flops": 29440178786048.0 + }, + "timestamp": "2025-09-04 03:57:02.812115", + "step": 1323, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03536463528871536, + "timestamp": "2025-09-04 03:57:02.853461", + "step": 1324, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:57:02.955808", + "step": 1324, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008404337801039219, + "timestamp": "2025-09-04 03:57:02.976945", + "step": 1325, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:57:03.085024", + "step": 1325, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02297317609190941, + "timestamp": "2025-09-04 03:57:03.104900", + "step": 1326, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:57:03.195185", + "step": 1326, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0200307946652174, + "timestamp": "2025-09-04 03:57:03.212065", + "step": 1327, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:57:03.311518", + "step": 1327, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006862281356006861, + "timestamp": "2025-09-04 03:57:03.330810", + "step": 1328, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:57:03.436212", + "step": 1328, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.13922302424907684, + "timestamp": "2025-09-04 03:57:03.458300", + "step": 1329, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:57:03.567713", + "step": 1329, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.028604896739125252, + "timestamp": "2025-09-04 03:57:03.588279", + "step": 1330, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:57:03.690531", + "step": 1330, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015768418088555336, + "timestamp": "2025-09-04 03:57:03.709730", + "step": 1331, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:57:03.803908", + "step": 1331, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021719908341765404, + "timestamp": "2025-09-04 03:57:03.821947", + "step": 1332, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 03:57:03.896565", + "step": 1332, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02307227812707424, + "timestamp": "2025-09-04 03:57:03.911261", + "step": 1333, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:57:04.002837", + "step": 1333, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0028728533070534468, + "timestamp": "2025-09-04 03:57:04.019376", + "step": 1334, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:57:04.122476", + "step": 1334, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03315138444304466, + "timestamp": "2025-09-04 03:57:04.141454", + "step": 1335, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:57:04.236926", + "step": 1335, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008455898612737656, + "timestamp": "2025-09-04 03:57:04.255020", + "step": 1336, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:57:04.361785", + "step": 1336, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.1121997982263565, + "timestamp": "2025-09-04 03:57:04.384050", + "step": 1337, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:57:04.497628", + "step": 1337, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017756765708327293, + "timestamp": "2025-09-04 03:57:04.517733", + "step": 1338, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:57:04.603610", + "step": 1338, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00965754222124815, + "timestamp": "2025-09-04 03:57:04.618677", + "step": 1339, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:57:04.696431", + "step": 1339, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02266879379749298, + "timestamp": "2025-09-04 03:57:04.711103", + "step": 1340, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:57:13.127580", + "step": 1340, + "epoch": 2 + }, + { + "type": "pplx", + "content": 356.8557867994047, + "timestamp": "2025-09-04 03:57:13.129660", + "step": 1340, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:57:13.211956", + "step": 1340, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.028563033789396286, + "timestamp": "2025-09-04 03:57:13.228882", + "step": 1341, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:57:13.330240", + "step": 1341, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04276951029896736, + "timestamp": "2025-09-04 03:57:13.349088", + "step": 1342, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:57:13.455596", + "step": 1342, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017385128885507584, + "timestamp": "2025-09-04 03:57:13.475336", + "step": 1343, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:57:13.559205", + "step": 1343, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009770605713129044, + "timestamp": "2025-09-04 03:57:13.574334", + "step": 1344, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:57:13.650919", + "step": 1344, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01318974420428276, + "timestamp": "2025-09-04 03:57:13.665861", + "step": 1345, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:57:13.774914", + "step": 1345, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004364403896033764, + "timestamp": "2025-09-04 03:57:13.795071", + "step": 1346, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:57:13.899168", + "step": 1346, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007132431026548147, + "timestamp": "2025-09-04 03:57:13.918237", + "step": 1347, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:57:14.009956", + "step": 1347, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012063604779541492, + "timestamp": "2025-09-04 03:57:14.027233", + "step": 1348, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:57:14.125654", + "step": 1348, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020999517291784286, + "timestamp": "2025-09-04 03:57:14.145819", + "step": 1349, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:57:14.239683", + "step": 1349, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007064122706651688, + "timestamp": "2025-09-04 03:57:14.256533", + "step": 1350, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:57:14.359985", + "step": 1350, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00644453801214695, + "timestamp": "2025-09-04 03:57:14.378894", + "step": 1351, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:57:14.465017", + "step": 1351, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004426640458405018, + "timestamp": "2025-09-04 03:57:14.480574", + "step": 1352, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:57:14.586535", + "step": 1352, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00213529821485281, + "timestamp": "2025-09-04 03:57:14.608254", + "step": 1353, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:57:14.703837", + "step": 1353, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009575147181749344, + "timestamp": "2025-09-04 03:57:14.721141", + "step": 1354, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:57:14.812433", + "step": 1354, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009818262420594692, + "timestamp": "2025-09-04 03:57:14.829234", + "step": 1355, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:57:14.907815", + "step": 1355, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006910801865160465, + "timestamp": "2025-09-04 03:57:14.922554", + "step": 1356, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:57:15.020511", + "step": 1356, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.025037406012415886, + "timestamp": "2025-09-04 03:57:15.040919", + "step": 1357, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:57:15.141088", + "step": 1357, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.058873310685157776, + "timestamp": "2025-09-04 03:57:15.159416", + "step": 1358, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:57:15.257132", + "step": 1358, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008470847271382809, + "timestamp": "2025-09-04 03:57:15.274434", + "step": 1359, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:57:15.380421", + "step": 1359, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.038173649460077286, + "timestamp": "2025-09-04 03:57:15.400957", + "step": 1360, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:57:23.804518", + "step": 1360, + "epoch": 2 + }, + { + "type": "pplx", + "content": 358.21292183839546, + "timestamp": "2025-09-04 03:57:23.806482", + "step": 1360, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1360", + "timestamp": "2025-09-04 03:57:24.379494", + "step": 1360, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:57:24.471417", + "step": 1360, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003160255728289485, + "timestamp": "2025-09-04 03:57:24.490511", + "step": 1361, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:57:24.600621", + "step": 1361, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01291283406317234, + "timestamp": "2025-09-04 03:57:24.620887", + "step": 1362, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:57:24.712441", + "step": 1362, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01215626671910286, + "timestamp": "2025-09-04 03:57:24.729005", + "step": 1363, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1488 + ], + "flops": 29760180728640.0 + }, + "timestamp": "2025-09-04 03:57:24.949046", + "step": 1363, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007885631173849106, + "timestamp": "2025-09-04 03:57:24.992019", + "step": 1364, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:57:25.085325", + "step": 1364, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009847918525338173, + "timestamp": "2025-09-04 03:57:25.104269", + "step": 1365, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:57:25.212570", + "step": 1365, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.056854456663131714, + "timestamp": "2025-09-04 03:57:25.232642", + "step": 1366, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 880 + ], + "flops": 17600106910144.0 + }, + "timestamp": "2025-09-04 03:57:25.361488", + "step": 1366, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05310133844614029, + "timestamp": "2025-09-04 03:57:25.384890", + "step": 1367, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:57:25.492476", + "step": 1367, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014655319042503834, + "timestamp": "2025-09-04 03:57:25.512923", + "step": 1368, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:57:25.595251", + "step": 1368, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012807800434529781, + "timestamp": "2025-09-04 03:57:25.611697", + "step": 1369, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:57:25.714660", + "step": 1369, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012346881441771984, + "timestamp": "2025-09-04 03:57:25.733731", + "step": 1370, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:57:25.812931", + "step": 1370, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.08287136256694794, + "timestamp": "2025-09-04 03:57:25.826896", + "step": 1371, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:57:25.934910", + "step": 1371, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020002219825983047, + "timestamp": "2025-09-04 03:57:25.955901", + "step": 1372, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:57:26.064243", + "step": 1372, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.11615262180566788, + "timestamp": "2025-09-04 03:57:26.084428", + "step": 1373, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:57:26.196826", + "step": 1373, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006950510665774345, + "timestamp": "2025-09-04 03:57:26.217171", + "step": 1374, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:57:26.324298", + "step": 1374, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008125067688524723, + "timestamp": "2025-09-04 03:57:26.344166", + "step": 1375, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:57:26.427863", + "step": 1375, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.027247097343206406, + "timestamp": "2025-09-04 03:57:26.443748", + "step": 1376, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:57:26.525142", + "step": 1376, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0010083671659231186, + "timestamp": "2025-09-04 03:57:26.541440", + "step": 1377, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:57:26.641242", + "step": 1377, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021171875298023224, + "timestamp": "2025-09-04 03:57:26.659635", + "step": 1378, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:57:26.770114", + "step": 1378, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05313687399029732, + "timestamp": "2025-09-04 03:57:26.790735", + "step": 1379, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:57:26.897067", + "step": 1379, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009001716040074825, + "timestamp": "2025-09-04 03:57:26.917597", + "step": 1380, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:57:35.371165", + "step": 1380, + "epoch": 2 + }, + { + "type": "pplx", + "content": 356.8696831217775, + "timestamp": "2025-09-04 03:57:35.374916", + "step": 1380, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:57:35.479063", + "step": 1380, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03470727056264877, + "timestamp": "2025-09-04 03:57:35.501369", + "step": 1381, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:57:35.598653", + "step": 1381, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04004732519388199, + "timestamp": "2025-09-04 03:57:35.616139", + "step": 1382, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:57:35.722177", + "step": 1382, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03036014549434185, + "timestamp": "2025-09-04 03:57:35.741436", + "step": 1383, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:57:35.849375", + "step": 1383, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05383269488811493, + "timestamp": "2025-09-04 03:57:35.870112", + "step": 1384, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:57:35.963863", + "step": 1384, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.10220647603273392, + "timestamp": "2025-09-04 03:57:35.983063", + "step": 1385, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:57:36.070880", + "step": 1385, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010271705687046051, + "timestamp": "2025-09-04 03:57:36.086507", + "step": 1386, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:57:36.190060", + "step": 1386, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018055927008390427, + "timestamp": "2025-09-04 03:57:36.209350", + "step": 1387, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:57:36.288183", + "step": 1387, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0347399078309536, + "timestamp": "2025-09-04 03:57:36.303106", + "step": 1388, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 03:57:36.374190", + "step": 1388, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017108459025621414, + "timestamp": "2025-09-04 03:57:36.388356", + "step": 1389, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:57:36.467290", + "step": 1389, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.019396977499127388, + "timestamp": "2025-09-04 03:57:36.481460", + "step": 1390, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:57:36.584044", + "step": 1390, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.029010986909270287, + "timestamp": "2025-09-04 03:57:36.603372", + "step": 1391, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:57:36.688236", + "step": 1391, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05589752271771431, + "timestamp": "2025-09-04 03:57:36.704586", + "step": 1392, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:57:36.795039", + "step": 1392, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.027025602757930756, + "timestamp": "2025-09-04 03:57:36.813941", + "step": 1393, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 03:57:36.936119", + "step": 1393, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0015675474423915148, + "timestamp": "2025-09-04 03:57:36.959466", + "step": 1394, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:57:37.044245", + "step": 1394, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02568882144987583, + "timestamp": "2025-09-04 03:57:37.059306", + "step": 1395, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:57:37.158661", + "step": 1395, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.06579367071390152, + "timestamp": "2025-09-04 03:57:37.178098", + "step": 1396, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:57:37.275495", + "step": 1396, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013132071122527122, + "timestamp": "2025-09-04 03:57:37.296202", + "step": 1397, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:57:37.402412", + "step": 1397, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00368613563477993, + "timestamp": "2025-09-04 03:57:37.422511", + "step": 1398, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:57:37.520438", + "step": 1398, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.027636591345071793, + "timestamp": "2025-09-04 03:57:37.539120", + "step": 1399, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:57:37.633885", + "step": 1399, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01690077967941761, + "timestamp": "2025-09-04 03:57:37.652181", + "step": 1400, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:57:46.023669", + "step": 1400, + "epoch": 2 + }, + { + "type": "pplx", + "content": 355.6197814395891, + "timestamp": "2025-09-04 03:57:46.025721", + "step": 1400, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1400", + "timestamp": "2025-09-04 03:57:46.538383", + "step": 1400, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 960 + ], + "flops": 19200116623104.0 + }, + "timestamp": "2025-09-04 03:57:46.671119", + "step": 1400, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.035729728639125824, + "timestamp": "2025-09-04 03:57:46.699862", + "step": 1401, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:57:46.790258", + "step": 1401, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.06573422253131866, + "timestamp": "2025-09-04 03:57:46.806890", + "step": 1402, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:57:46.909490", + "step": 1402, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014802222140133381, + "timestamp": "2025-09-04 03:57:46.928221", + "step": 1403, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:57:47.030134", + "step": 1403, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03306787088513374, + "timestamp": "2025-09-04 03:57:47.049863", + "step": 1404, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:57:47.133628", + "step": 1404, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014069135300815105, + "timestamp": "2025-09-04 03:57:47.150471", + "step": 1405, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:57:47.254620", + "step": 1405, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011790101416409016, + "timestamp": "2025-09-04 03:57:47.273633", + "step": 1406, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:57:47.357993", + "step": 1406, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.036905501037836075, + "timestamp": "2025-09-04 03:57:47.373168", + "step": 1407, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 03:57:47.490818", + "step": 1407, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04053547978401184, + "timestamp": "2025-09-04 03:57:47.513458", + "step": 1408, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:57:47.610652", + "step": 1408, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.10574720799922943, + "timestamp": "2025-09-04 03:57:47.630819", + "step": 1409, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:57:47.726719", + "step": 1409, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03774774447083473, + "timestamp": "2025-09-04 03:57:47.743637", + "step": 1410, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:57:47.855271", + "step": 1410, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.11015975475311279, + "timestamp": "2025-09-04 03:57:47.875714", + "step": 1411, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:57:47.975623", + "step": 1411, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.031532928347587585, + "timestamp": "2025-09-04 03:57:47.994819", + "step": 1412, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:57:48.084984", + "step": 1412, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017254313454031944, + "timestamp": "2025-09-04 03:57:48.103711", + "step": 1413, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:57:48.180312", + "step": 1413, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.12299531698226929, + "timestamp": "2025-09-04 03:57:48.194006", + "step": 1414, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:57:48.289289", + "step": 1414, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011667820625007153, + "timestamp": "2025-09-04 03:57:48.306300", + "step": 1415, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:57:48.415936", + "step": 1415, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.06725858896970749, + "timestamp": "2025-09-04 03:57:48.435803", + "step": 1416, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:57:48.528418", + "step": 1416, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022478509694337845, + "timestamp": "2025-09-04 03:57:48.547300", + "step": 1417, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:57:48.656960", + "step": 1417, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.034010887145996094, + "timestamp": "2025-09-04 03:57:48.677147", + "step": 1418, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:57:48.775323", + "step": 1418, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.030013922601938248, + "timestamp": "2025-09-04 03:57:48.792730", + "step": 1419, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 03:57:48.910608", + "step": 1419, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004974214360117912, + "timestamp": "2025-09-04 03:57:48.933542", + "step": 1420, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:57:57.300582", + "step": 1420, + "epoch": 2 + }, + { + "type": "pplx", + "content": 352.19748924586287, + "timestamp": "2025-09-04 03:57:57.303117", + "step": 1420, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1248 + ], + "flops": 24960151589760.0 + }, + "timestamp": "2025-09-04 03:57:57.483232", + "step": 1420, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004329991061240435, + "timestamp": "2025-09-04 03:57:57.521231", + "step": 1421, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:57:57.605182", + "step": 1421, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012752629816532135, + "timestamp": "2025-09-04 03:57:57.620094", + "step": 1422, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 03:57:57.737791", + "step": 1422, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011430691927671432, + "timestamp": "2025-09-04 03:57:57.759708", + "step": 1423, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 03:57:57.834971", + "step": 1423, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.09056932479143143, + "timestamp": "2025-09-04 03:57:57.849283", + "step": 1424, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:57:57.928832", + "step": 1424, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05736755579710007, + "timestamp": "2025-09-04 03:57:57.944122", + "step": 1425, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:57:58.034731", + "step": 1425, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010904895141720772, + "timestamp": "2025-09-04 03:57:58.051389", + "step": 1426, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:57:58.161440", + "step": 1426, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015301964245736599, + "timestamp": "2025-09-04 03:57:58.181793", + "step": 1427, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:57:58.278459", + "step": 1427, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006432659458369017, + "timestamp": "2025-09-04 03:57:58.296416", + "step": 1428, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:57:58.393057", + "step": 1428, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04648457467556, + "timestamp": "2025-09-04 03:57:58.413285", + "step": 1429, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:57:58.498896", + "step": 1429, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012602617032825947, + "timestamp": "2025-09-04 03:57:58.513769", + "step": 1430, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:57:58.622078", + "step": 1430, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00956171378493309, + "timestamp": "2025-09-04 03:57:58.642137", + "step": 1431, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:57:58.736074", + "step": 1431, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.20023715496063232, + "timestamp": "2025-09-04 03:57:58.753934", + "step": 1432, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:57:58.831296", + "step": 1432, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03244839608669281, + "timestamp": "2025-09-04 03:57:58.846617", + "step": 1433, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:57:58.947316", + "step": 1433, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.019599279388785362, + "timestamp": "2025-09-04 03:57:58.965931", + "step": 1434, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 03:57:59.083309", + "step": 1434, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016808245331048965, + "timestamp": "2025-09-04 03:57:59.105217", + "step": 1435, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:57:59.216073", + "step": 1435, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017745893448591232, + "timestamp": "2025-09-04 03:57:59.237285", + "step": 1436, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:57:59.336156", + "step": 1436, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007716418243944645, + "timestamp": "2025-09-04 03:57:59.356987", + "step": 1437, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:57:59.452348", + "step": 1437, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0655415803194046, + "timestamp": "2025-09-04 03:57:59.469660", + "step": 1438, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:57:59.572428", + "step": 1438, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.039280664175748825, + "timestamp": "2025-09-04 03:57:59.591378", + "step": 1439, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:57:59.686127", + "step": 1439, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.047219302505254745, + "timestamp": "2025-09-04 03:57:59.704167", + "step": 1440, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:58:08.097411", + "step": 1440, + "epoch": 2 + }, + { + "type": "pplx", + "content": 347.4976195756163, + "timestamp": "2025-09-04 03:58:08.099469", + "step": 1440, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1440", + "timestamp": "2025-09-04 03:58:08.613639", + "step": 1440, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:58:08.712984", + "step": 1440, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022071918472647667, + "timestamp": "2025-09-04 03:58:08.733921", + "step": 1441, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:58:08.842886", + "step": 1441, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02069196105003357, + "timestamp": "2025-09-04 03:58:08.862926", + "step": 1442, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:58:08.972102", + "step": 1442, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.001259633689187467, + "timestamp": "2025-09-04 03:58:08.992159", + "step": 1443, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:58:09.098857", + "step": 1443, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05347844213247299, + "timestamp": "2025-09-04 03:58:09.119414", + "step": 1444, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 896 + ], + "flops": 17920108852736.0 + }, + "timestamp": "2025-09-04 03:58:09.245780", + "step": 1444, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05717499926686287, + "timestamp": "2025-09-04 03:58:09.272832", + "step": 1445, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:58:09.373325", + "step": 1445, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.040185511112213135, + "timestamp": "2025-09-04 03:58:09.392288", + "step": 1446, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:58:09.500710", + "step": 1446, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024745700880885124, + "timestamp": "2025-09-04 03:58:09.520583", + "step": 1447, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:58:09.628530", + "step": 1447, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011510748416185379, + "timestamp": "2025-09-04 03:58:09.649508", + "step": 1448, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:58:09.734063", + "step": 1448, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014999719336628914, + "timestamp": "2025-09-04 03:58:09.750818", + "step": 1449, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:58:09.861652", + "step": 1449, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006128712557256222, + "timestamp": "2025-09-04 03:58:09.882168", + "step": 1450, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:58:09.985238", + "step": 1450, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013433975167572498, + "timestamp": "2025-09-04 03:58:10.004317", + "step": 1451, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:58:10.081730", + "step": 1451, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007306399755179882, + "timestamp": "2025-09-04 03:58:10.096553", + "step": 1452, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:58:10.188094", + "step": 1452, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004065982531756163, + "timestamp": "2025-09-04 03:58:10.207086", + "step": 1453, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:58:10.307494", + "step": 1453, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005130627192556858, + "timestamp": "2025-09-04 03:58:10.326129", + "step": 1454, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:58:10.404524", + "step": 1454, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01915198192000389, + "timestamp": "2025-09-04 03:58:10.418656", + "step": 1455, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:58:10.519421", + "step": 1455, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006536707282066345, + "timestamp": "2025-09-04 03:58:10.538875", + "step": 1456, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:58:10.645308", + "step": 1456, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.029277930036187172, + "timestamp": "2025-09-04 03:58:10.667816", + "step": 1457, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 03:58:10.784355", + "step": 1457, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01957911066710949, + "timestamp": "2025-09-04 03:58:10.806480", + "step": 1458, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:58:10.916254", + "step": 1458, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009834788739681244, + "timestamp": "2025-09-04 03:58:10.936503", + "step": 1459, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:58:11.044736", + "step": 1459, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.040386419743299484, + "timestamp": "2025-09-04 03:58:11.065853", + "step": 1460, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:58:19.451938", + "step": 1460, + "epoch": 2 + }, + { + "type": "pplx", + "content": 348.4467862100084, + "timestamp": "2025-09-04 03:58:19.453968", + "step": 1460, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:58:19.549185", + "step": 1460, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05862518399953842, + "timestamp": "2025-09-04 03:58:19.569593", + "step": 1461, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:58:19.653427", + "step": 1461, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010720998048782349, + "timestamp": "2025-09-04 03:58:19.668367", + "step": 1462, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:58:19.769165", + "step": 1462, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0041922107338905334, + "timestamp": "2025-09-04 03:58:19.788193", + "step": 1463, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:58:19.871897", + "step": 1463, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02185150980949402, + "timestamp": "2025-09-04 03:58:19.887593", + "step": 1464, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:58:19.978920", + "step": 1464, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02324114739894867, + "timestamp": "2025-09-04 03:58:19.997814", + "step": 1465, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:58:20.089399", + "step": 1465, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01927514187991619, + "timestamp": "2025-09-04 03:58:20.105963", + "step": 1466, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:58:20.200168", + "step": 1466, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.029533013701438904, + "timestamp": "2025-09-04 03:58:20.217003", + "step": 1467, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:58:20.321673", + "step": 1467, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.06191162019968033, + "timestamp": "2025-09-04 03:58:20.341487", + "step": 1468, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:58:20.444498", + "step": 1468, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008275226689875126, + "timestamp": "2025-09-04 03:58:20.466423", + "step": 1469, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:58:20.562481", + "step": 1469, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020667975768446922, + "timestamp": "2025-09-04 03:58:20.579977", + "step": 1470, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 03:58:20.653299", + "step": 1470, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023650793358683586, + "timestamp": "2025-09-04 03:58:20.665998", + "step": 1471, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:58:20.767543", + "step": 1471, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02776520885527134, + "timestamp": "2025-09-04 03:58:20.786811", + "step": 1472, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:58:20.864142", + "step": 1472, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01794314943253994, + "timestamp": "2025-09-04 03:58:20.878780", + "step": 1473, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:58:20.990087", + "step": 1473, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.028163762763142586, + "timestamp": "2025-09-04 03:58:21.009697", + "step": 1474, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:58:21.118248", + "step": 1474, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016635507345199585, + "timestamp": "2025-09-04 03:58:21.137598", + "step": 1475, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:58:21.223856", + "step": 1475, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.09255839139223099, + "timestamp": "2025-09-04 03:58:21.239630", + "step": 1476, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:58:21.338871", + "step": 1476, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03798213601112366, + "timestamp": "2025-09-04 03:58:21.359255", + "step": 1477, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:58:21.460978", + "step": 1477, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02084563672542572, + "timestamp": "2025-09-04 03:58:21.479589", + "step": 1478, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:58:21.563333", + "step": 1478, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009020745754241943, + "timestamp": "2025-09-04 03:58:21.578184", + "step": 1479, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:58:21.688017", + "step": 1479, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0042407093569636345, + "timestamp": "2025-09-04 03:58:21.708855", + "step": 1480, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:58:30.096181", + "step": 1480, + "epoch": 2 + }, + { + "type": "pplx", + "content": 350.9113278716737, + "timestamp": "2025-09-04 03:58:30.098321", + "step": 1480, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1480", + "timestamp": "2025-09-04 03:58:30.453234", + "step": 1480, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:58:30.528236", + "step": 1480, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.042050596326589584, + "timestamp": "2025-09-04 03:58:30.543158", + "step": 1481, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:58:30.651692", + "step": 1481, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023750372231006622, + "timestamp": "2025-09-04 03:58:30.672015", + "step": 1482, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:58:30.776255", + "step": 1482, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008751154877245426, + "timestamp": "2025-09-04 03:58:30.795315", + "step": 1483, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:58:30.872932", + "step": 1483, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013106334023177624, + "timestamp": "2025-09-04 03:58:30.887472", + "step": 1484, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 800 + ], + "flops": 16000097197184.0 + }, + "timestamp": "2025-09-04 03:58:31.005957", + "step": 1484, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.029924411326646805, + "timestamp": "2025-09-04 03:58:31.029590", + "step": 1485, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:58:31.130014", + "step": 1485, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024293435737490654, + "timestamp": "2025-09-04 03:58:31.148274", + "step": 1486, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:58:31.257554", + "step": 1486, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.06716310232877731, + "timestamp": "2025-09-04 03:58:31.277164", + "step": 1487, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:58:31.384124", + "step": 1487, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02108912356197834, + "timestamp": "2025-09-04 03:58:31.404686", + "step": 1488, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:58:31.481010", + "step": 1488, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.07503022998571396, + "timestamp": "2025-09-04 03:58:31.496061", + "step": 1489, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:58:31.581333", + "step": 1489, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0028276981320232153, + "timestamp": "2025-09-04 03:58:31.596330", + "step": 1490, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:58:31.689572", + "step": 1490, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02281711809337139, + "timestamp": "2025-09-04 03:58:31.706718", + "step": 1491, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:58:31.816423", + "step": 1491, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.026799194514751434, + "timestamp": "2025-09-04 03:58:31.837569", + "step": 1492, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:58:31.945511", + "step": 1492, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.027529401704669, + "timestamp": "2025-09-04 03:58:31.968207", + "step": 1493, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:58:32.077972", + "step": 1493, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0032774689607322216, + "timestamp": "2025-09-04 03:58:32.097948", + "step": 1494, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:58:32.193448", + "step": 1494, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03091510199010372, + "timestamp": "2025-09-04 03:58:32.210579", + "step": 1495, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:58:32.289594", + "step": 1495, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018100092187523842, + "timestamp": "2025-09-04 03:58:32.303714", + "step": 1496, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:58:32.405510", + "step": 1496, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.06709955632686615, + "timestamp": "2025-09-04 03:58:32.425599", + "step": 1497, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:58:32.526213", + "step": 1497, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016714708879590034, + "timestamp": "2025-09-04 03:58:32.543025", + "step": 1498, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:58:32.641548", + "step": 1498, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009932179003953934, + "timestamp": "2025-09-04 03:58:32.658618", + "step": 1499, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:58:32.771050", + "step": 1499, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002495410619303584, + "timestamp": "2025-09-04 03:58:32.792042", + "step": 1500, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:58:41.271072", + "step": 1500, + "epoch": 2 + }, + { + "type": "pplx", + "content": 352.31429226333887, + "timestamp": "2025-09-04 03:58:41.273723", + "step": 1500, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:58:41.354514", + "step": 1500, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.11568693816661835, + "timestamp": "2025-09-04 03:58:41.371425", + "step": 1501, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:58:41.475740", + "step": 1501, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01670590229332447, + "timestamp": "2025-09-04 03:58:41.494996", + "step": 1502, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:58:41.589828", + "step": 1502, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.019445618614554405, + "timestamp": "2025-09-04 03:58:41.606867", + "step": 1503, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:58:41.685355", + "step": 1503, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.057172223925590515, + "timestamp": "2025-09-04 03:58:41.700285", + "step": 1504, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:58:41.800836", + "step": 1504, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00811680406332016, + "timestamp": "2025-09-04 03:58:41.821852", + "step": 1505, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:58:41.904781", + "step": 1505, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04069969430565834, + "timestamp": "2025-09-04 03:58:41.918645", + "step": 1506, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:58:42.006065", + "step": 1506, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011170770972967148, + "timestamp": "2025-09-04 03:58:42.019930", + "step": 1507, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:58:42.124593", + "step": 1507, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011762701906263828, + "timestamp": "2025-09-04 03:58:42.144303", + "step": 1508, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:58:42.250667", + "step": 1508, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004596356768161058, + "timestamp": "2025-09-04 03:58:42.272663", + "step": 1509, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:58:42.379939", + "step": 1509, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005542484112083912, + "timestamp": "2025-09-04 03:58:42.399644", + "step": 1510, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:58:42.504187", + "step": 1510, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.048978500068187714, + "timestamp": "2025-09-04 03:58:42.523284", + "step": 1511, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:58:42.627086", + "step": 1511, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0095089515671134, + "timestamp": "2025-09-04 03:58:42.646752", + "step": 1512, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:58:42.745841", + "step": 1512, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012734484858810902, + "timestamp": "2025-09-04 03:58:42.766995", + "step": 1513, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1408 + ], + "flops": 28160171015680.0 + }, + "timestamp": "2025-09-04 03:58:42.971571", + "step": 1513, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0094110993668437, + "timestamp": "2025-09-04 03:58:43.010715", + "step": 1514, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:58:43.087980", + "step": 1514, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006989278364926577, + "timestamp": "2025-09-04 03:58:43.101850", + "step": 1515, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:58:43.202917", + "step": 1515, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03085504285991192, + "timestamp": "2025-09-04 03:58:43.222336", + "step": 1516, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:58:43.320624", + "step": 1516, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0065007261000573635, + "timestamp": "2025-09-04 03:58:43.340918", + "step": 1517, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:58:43.443356", + "step": 1517, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020510029047727585, + "timestamp": "2025-09-04 03:58:43.462464", + "step": 1518, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:58:43.561904", + "step": 1518, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010851016268134117, + "timestamp": "2025-09-04 03:58:43.580231", + "step": 1519, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:58:43.681348", + "step": 1519, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011947129853069782, + "timestamp": "2025-09-04 03:58:43.700623", + "step": 1520, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:58:52.103185", + "step": 1520, + "epoch": 2 + }, + { + "type": "pplx", + "content": 354.340881585885, + "timestamp": "2025-09-04 03:58:52.104933", + "step": 1520, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1520", + "timestamp": "2025-09-04 03:58:52.596824", + "step": 1520, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:58:52.671295", + "step": 1520, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004985039122402668, + "timestamp": "2025-09-04 03:58:52.686417", + "step": 1521, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:58:52.763053", + "step": 1521, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04416259378194809, + "timestamp": "2025-09-04 03:58:52.776998", + "step": 1522, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:58:52.885793", + "step": 1522, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.07206206768751144, + "timestamp": "2025-09-04 03:58:52.906087", + "step": 1523, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:58:52.999974", + "step": 1523, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011458688415586948, + "timestamp": "2025-09-04 03:58:53.017860", + "step": 1524, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:58:53.116338", + "step": 1524, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014014442451298237, + "timestamp": "2025-09-04 03:58:53.137043", + "step": 1525, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:58:53.238547", + "step": 1525, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007480642758309841, + "timestamp": "2025-09-04 03:58:53.257330", + "step": 1526, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1488 + ], + "flops": 29760180728640.0 + }, + "timestamp": "2025-09-04 03:58:53.477272", + "step": 1526, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0391855351626873, + "timestamp": "2025-09-04 03:58:53.519609", + "step": 1527, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:58:53.622364", + "step": 1527, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002516398439183831, + "timestamp": "2025-09-04 03:58:53.642357", + "step": 1528, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:58:53.717657", + "step": 1528, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.028322208672761917, + "timestamp": "2025-09-04 03:58:53.732958", + "step": 1529, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1200 + ], + "flops": 24000145761984.0 + }, + "timestamp": "2025-09-04 03:58:53.908721", + "step": 1529, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0025973438750952482, + "timestamp": "2025-09-04 03:58:53.941662", + "step": 1530, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:58:54.036743", + "step": 1530, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.06388135254383087, + "timestamp": "2025-09-04 03:58:54.054194", + "step": 1531, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:58:54.160713", + "step": 1531, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006396736484020948, + "timestamp": "2025-09-04 03:58:54.181459", + "step": 1532, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:58:54.286647", + "step": 1532, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.028667569160461426, + "timestamp": "2025-09-04 03:58:54.308867", + "step": 1533, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:58:54.410070", + "step": 1533, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.11603422462940216, + "timestamp": "2025-09-04 03:58:54.428882", + "step": 1534, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:58:54.534808", + "step": 1534, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011094557121396065, + "timestamp": "2025-09-04 03:58:54.553867", + "step": 1535, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1376 + ], + "flops": 27520167130496.0 + }, + "timestamp": "2025-09-04 03:58:54.757887", + "step": 1535, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02706741727888584, + "timestamp": "2025-09-04 03:58:54.797893", + "step": 1536, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:58:54.892179", + "step": 1536, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01568935438990593, + "timestamp": "2025-09-04 03:58:54.911469", + "step": 1537, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:58:54.995117", + "step": 1537, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009411556646227837, + "timestamp": "2025-09-04 03:58:55.010256", + "step": 1538, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:58:55.118774", + "step": 1538, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01080233883112669, + "timestamp": "2025-09-04 03:58:55.138032", + "step": 1539, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:58:55.237800", + "step": 1539, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016464075073599815, + "timestamp": "2025-09-04 03:58:55.257398", + "step": 1540, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:59:03.697121", + "step": 1540, + "epoch": 2 + }, + { + "type": "pplx", + "content": 358.13731037704423, + "timestamp": "2025-09-04 03:59:03.699487", + "step": 1540, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:59:03.800069", + "step": 1540, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017000077292323112, + "timestamp": "2025-09-04 03:59:03.821025", + "step": 1541, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:59:03.926237", + "step": 1541, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018205707892775536, + "timestamp": "2025-09-04 03:59:03.945068", + "step": 1542, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 03:59:04.069382", + "step": 1542, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.032110054045915604, + "timestamp": "2025-09-04 03:59:04.091913", + "step": 1543, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:59:04.202077", + "step": 1543, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0027143440674990416, + "timestamp": "2025-09-04 03:59:04.222663", + "step": 1544, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:59:04.332288", + "step": 1544, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.030216865241527557, + "timestamp": "2025-09-04 03:59:04.354365", + "step": 1545, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:59:04.465187", + "step": 1545, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03675489500164986, + "timestamp": "2025-09-04 03:59:04.484744", + "step": 1546, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:59:04.562572", + "step": 1546, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03915338218212128, + "timestamp": "2025-09-04 03:59:04.576156", + "step": 1547, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 960 + ], + "flops": 19200116623104.0 + }, + "timestamp": "2025-09-04 03:59:04.714903", + "step": 1547, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0727081373333931, + "timestamp": "2025-09-04 03:59:04.741282", + "step": 1548, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:59:04.819762", + "step": 1548, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03566645830869675, + "timestamp": "2025-09-04 03:59:04.834687", + "step": 1549, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:59:04.939334", + "step": 1549, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020596234127879143, + "timestamp": "2025-09-04 03:59:04.957967", + "step": 1550, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:59:05.060642", + "step": 1550, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00818456057459116, + "timestamp": "2025-09-04 03:59:05.078828", + "step": 1551, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:59:05.189583", + "step": 1551, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05260307341814041, + "timestamp": "2025-09-04 03:59:05.210406", + "step": 1552, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:59:05.313980", + "step": 1552, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006511691492050886, + "timestamp": "2025-09-04 03:59:05.334487", + "step": 1553, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:59:05.440815", + "step": 1553, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009990708902478218, + "timestamp": "2025-09-04 03:59:05.459864", + "step": 1554, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1168 + ], + "flops": 23360141876800.0 + }, + "timestamp": "2025-09-04 03:59:05.636814", + "step": 1554, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004030467942357063, + "timestamp": "2025-09-04 03:59:05.668857", + "step": 1555, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:59:05.756761", + "step": 1555, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.031922031193971634, + "timestamp": "2025-09-04 03:59:05.772959", + "step": 1556, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:59:05.865809", + "step": 1556, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021661344915628433, + "timestamp": "2025-09-04 03:59:05.884706", + "step": 1557, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:59:05.979801", + "step": 1557, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01415644958615303, + "timestamp": "2025-09-04 03:59:05.996754", + "step": 1558, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:59:06.091347", + "step": 1558, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0253834780305624, + "timestamp": "2025-09-04 03:59:06.108290", + "step": 1559, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:59:06.210438", + "step": 1559, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008007410913705826, + "timestamp": "2025-09-04 03:59:06.229849", + "step": 1560, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:59:14.729217", + "step": 1560, + "epoch": 2 + }, + { + "type": "pplx", + "content": 359.43894958396004, + "timestamp": "2025-09-04 03:59:14.731389", + "step": 1560, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1560", + "timestamp": "2025-09-04 03:59:15.095862", + "step": 1560, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:59:15.200111", + "step": 1560, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012292720377445221, + "timestamp": "2025-09-04 03:59:15.222316", + "step": 1561, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:59:15.322245", + "step": 1561, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03446212038397789, + "timestamp": "2025-09-04 03:59:15.340799", + "step": 1562, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:59:15.440100", + "step": 1562, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0038441934157162905, + "timestamp": "2025-09-04 03:59:15.458765", + "step": 1563, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:59:15.569481", + "step": 1563, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009270180948078632, + "timestamp": "2025-09-04 03:59:15.590580", + "step": 1564, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:59:15.683992", + "step": 1564, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0015130855608731508, + "timestamp": "2025-09-04 03:59:15.702558", + "step": 1565, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:59:15.812496", + "step": 1565, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.001221196842379868, + "timestamp": "2025-09-04 03:59:15.833028", + "step": 1566, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 03:59:15.917093", + "step": 1566, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01908346638083458, + "timestamp": "2025-09-04 03:59:15.931975", + "step": 1567, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:59:16.037182", + "step": 1567, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010458694770932198, + "timestamp": "2025-09-04 03:59:16.056869", + "step": 1568, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1376 + ], + "flops": 27520167130496.0 + }, + "timestamp": "2025-09-04 03:59:16.254852", + "step": 1568, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03718806803226471, + "timestamp": "2025-09-04 03:59:16.297722", + "step": 1569, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 03:59:16.414907", + "step": 1569, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006845841184258461, + "timestamp": "2025-09-04 03:59:16.437167", + "step": 1570, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:59:16.531656", + "step": 1570, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017309976741671562, + "timestamp": "2025-09-04 03:59:16.549235", + "step": 1571, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:59:16.656486", + "step": 1571, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.038009658455848694, + "timestamp": "2025-09-04 03:59:16.677504", + "step": 1572, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:59:16.781767", + "step": 1572, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005971661768853664, + "timestamp": "2025-09-04 03:59:16.803460", + "step": 1573, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 03:59:16.939010", + "step": 1573, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0011978168040513992, + "timestamp": "2025-09-04 03:59:16.964950", + "step": 1574, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:59:17.050202", + "step": 1574, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.06003192439675331, + "timestamp": "2025-09-04 03:59:17.065248", + "step": 1575, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:59:17.155313", + "step": 1575, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007113071624189615, + "timestamp": "2025-09-04 03:59:17.172663", + "step": 1576, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:59:17.269765", + "step": 1576, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012526609003543854, + "timestamp": "2025-09-04 03:59:17.289917", + "step": 1577, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 03:59:17.373409", + "step": 1577, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05469227954745293, + "timestamp": "2025-09-04 03:59:17.388299", + "step": 1578, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:59:17.481308", + "step": 1578, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020947255194187164, + "timestamp": "2025-09-04 03:59:17.498309", + "step": 1579, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 03:59:17.570426", + "step": 1579, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.049049898982048035, + "timestamp": "2025-09-04 03:59:17.583920", + "step": 1580, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:59:25.958273", + "step": 1580, + "epoch": 2 + }, + { + "type": "pplx", + "content": 357.77819634386793, + "timestamp": "2025-09-04 03:59:25.959793", + "step": 1580, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:59:26.031511", + "step": 1580, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022066203877329826, + "timestamp": "2025-09-04 03:59:26.046549", + "step": 1581, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:59:26.124819", + "step": 1581, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02584720402956009, + "timestamp": "2025-09-04 03:59:26.139065", + "step": 1582, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:59:26.233903", + "step": 1582, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004149719141423702, + "timestamp": "2025-09-04 03:59:26.251416", + "step": 1583, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:59:26.344125", + "step": 1583, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.026423068717122078, + "timestamp": "2025-09-04 03:59:26.361696", + "step": 1584, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:59:26.458179", + "step": 1584, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014465868473052979, + "timestamp": "2025-09-04 03:59:26.478437", + "step": 1585, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 03:59:26.595337", + "step": 1585, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04363749548792839, + "timestamp": "2025-09-04 03:59:26.617500", + "step": 1586, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:59:26.711320", + "step": 1586, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021764691919088364, + "timestamp": "2025-09-04 03:59:26.728639", + "step": 1587, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 960 + ], + "flops": 19200116623104.0 + }, + "timestamp": "2025-09-04 03:59:26.866445", + "step": 1587, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008350017480552197, + "timestamp": "2025-09-04 03:59:26.893339", + "step": 1588, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:59:26.983861", + "step": 1588, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.038341108709573746, + "timestamp": "2025-09-04 03:59:27.002454", + "step": 1589, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:59:27.096991", + "step": 1589, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.033245623111724854, + "timestamp": "2025-09-04 03:59:27.114289", + "step": 1590, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:59:27.214904", + "step": 1590, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.025423089042305946, + "timestamp": "2025-09-04 03:59:27.233747", + "step": 1591, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 03:59:27.314232", + "step": 1591, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.07140816748142242, + "timestamp": "2025-09-04 03:59:27.328990", + "step": 1592, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:59:27.426941", + "step": 1592, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.06752584129571915, + "timestamp": "2025-09-04 03:59:27.447667", + "step": 1593, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 432 + ], + "flops": 8640052517568.0 + }, + "timestamp": "2025-09-04 03:59:27.519642", + "step": 1593, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.06612498313188553, + "timestamp": "2025-09-04 03:59:27.532132", + "step": 1594, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:59:27.640395", + "step": 1594, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011722175404429436, + "timestamp": "2025-09-04 03:59:27.660381", + "step": 1595, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:59:27.759366", + "step": 1595, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008636832237243652, + "timestamp": "2025-09-04 03:59:27.778747", + "step": 1596, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:59:27.862555", + "step": 1596, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015985198318958282, + "timestamp": "2025-09-04 03:59:27.879266", + "step": 1597, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:59:27.977074", + "step": 1597, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011449925601482391, + "timestamp": "2025-09-04 03:59:27.994541", + "step": 1598, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:59:28.099296", + "step": 1598, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005783349275588989, + "timestamp": "2025-09-04 03:59:28.117980", + "step": 1599, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 03:59:28.227054", + "step": 1599, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017474643886089325, + "timestamp": "2025-09-04 03:59:28.248142", + "step": 1600, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:59:36.640768", + "step": 1600, + "epoch": 2 + }, + { + "type": "pplx", + "content": 352.57666281747646, + "timestamp": "2025-09-04 03:59:36.643336", + "step": 1600, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1600", + "timestamp": "2025-09-04 03:59:37.160974", + "step": 1600, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:59:37.258635", + "step": 1600, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004377053584903479, + "timestamp": "2025-09-04 03:59:37.278760", + "step": 1601, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:59:37.357328", + "step": 1601, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.09427284449338913, + "timestamp": "2025-09-04 03:59:37.371013", + "step": 1602, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:59:37.468118", + "step": 1602, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006842142436653376, + "timestamp": "2025-09-04 03:59:37.485384", + "step": 1603, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:59:37.593709", + "step": 1603, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04602169245481491, + "timestamp": "2025-09-04 03:59:37.614356", + "step": 1604, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:59:37.717365", + "step": 1604, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0032124435529112816, + "timestamp": "2025-09-04 03:59:37.738360", + "step": 1605, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:59:37.850506", + "step": 1605, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0026126240845769644, + "timestamp": "2025-09-04 03:59:37.870407", + "step": 1606, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:59:37.962373", + "step": 1606, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0179721862077713, + "timestamp": "2025-09-04 03:59:37.979015", + "step": 1607, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:59:38.075975", + "step": 1607, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03628922253847122, + "timestamp": "2025-09-04 03:59:38.094017", + "step": 1608, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:59:38.196918", + "step": 1608, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.037857189774513245, + "timestamp": "2025-09-04 03:59:38.217694", + "step": 1609, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 03:59:38.295044", + "step": 1609, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01975720003247261, + "timestamp": "2025-09-04 03:59:38.308225", + "step": 1610, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:59:38.415102", + "step": 1610, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.049535252153873444, + "timestamp": "2025-09-04 03:59:38.433713", + "step": 1611, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:59:38.534698", + "step": 1611, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01265799067914486, + "timestamp": "2025-09-04 03:59:38.554118", + "step": 1612, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:59:38.638435", + "step": 1612, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.09150931984186172, + "timestamp": "2025-09-04 03:59:38.655552", + "step": 1613, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:59:38.754553", + "step": 1613, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007259078789502382, + "timestamp": "2025-09-04 03:59:38.773143", + "step": 1614, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 03:59:38.858195", + "step": 1614, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02660290338099003, + "timestamp": "2025-09-04 03:59:38.873585", + "step": 1615, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 816 + ], + "flops": 16320099139776.0 + }, + "timestamp": "2025-09-04 03:59:38.995688", + "step": 1615, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007390860002487898, + "timestamp": "2025-09-04 03:59:39.019381", + "step": 1616, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:59:39.113144", + "step": 1616, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016929682344198227, + "timestamp": "2025-09-04 03:59:39.132006", + "step": 1617, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:59:39.238588", + "step": 1617, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006743302568793297, + "timestamp": "2025-09-04 03:59:39.258325", + "step": 1618, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:59:39.373616", + "step": 1618, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017765972763299942, + "timestamp": "2025-09-04 03:59:39.393491", + "step": 1619, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 03:59:39.530286", + "step": 1619, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023081377148628235, + "timestamp": "2025-09-04 03:59:39.556794", + "step": 1620, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:59:47.940129", + "step": 1620, + "epoch": 2 + }, + { + "type": "pplx", + "content": 344.5752767610482, + "timestamp": "2025-09-04 03:59:47.944701", + "step": 1620, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 03:59:48.061987", + "step": 1620, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012214818969368935, + "timestamp": "2025-09-04 03:59:48.087346", + "step": 1621, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:59:48.182192", + "step": 1621, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0025064749643206596, + "timestamp": "2025-09-04 03:59:48.199612", + "step": 1622, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 03:59:48.334327", + "step": 1622, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013956296257674694, + "timestamp": "2025-09-04 03:59:48.360138", + "step": 1623, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 03:59:48.459381", + "step": 1623, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02604658156633377, + "timestamp": "2025-09-04 03:59:48.478589", + "step": 1624, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:59:48.592076", + "step": 1624, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015420682728290558, + "timestamp": "2025-09-04 03:59:48.614371", + "step": 1625, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:59:48.717389", + "step": 1625, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02754286862909794, + "timestamp": "2025-09-04 03:59:48.736363", + "step": 1626, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 03:59:48.812596", + "step": 1626, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0014424566179513931, + "timestamp": "2025-09-04 03:59:48.826120", + "step": 1627, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:59:48.920932", + "step": 1627, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015860071405768394, + "timestamp": "2025-09-04 03:59:48.938622", + "step": 1628, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 03:59:49.029533", + "step": 1628, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022340765222907066, + "timestamp": "2025-09-04 03:59:49.048431", + "step": 1629, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 03:59:49.141575", + "step": 1629, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.06079462915658951, + "timestamp": "2025-09-04 03:59:49.158388", + "step": 1630, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:59:49.259259", + "step": 1630, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024208059534430504, + "timestamp": "2025-09-04 03:59:49.277857", + "step": 1631, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:59:49.354899", + "step": 1631, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01721222698688507, + "timestamp": "2025-09-04 03:59:49.369440", + "step": 1632, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 03:59:49.462980", + "step": 1632, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023982705548405647, + "timestamp": "2025-09-04 03:59:49.482064", + "step": 1633, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:59:49.560407", + "step": 1633, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04129756614565849, + "timestamp": "2025-09-04 03:59:49.574243", + "step": 1634, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:59:49.677225", + "step": 1634, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010030495934188366, + "timestamp": "2025-09-04 03:59:49.696269", + "step": 1635, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 03:59:49.798566", + "step": 1635, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011695167981088161, + "timestamp": "2025-09-04 03:59:49.818213", + "step": 1636, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 03:59:49.906625", + "step": 1636, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.038352031260728836, + "timestamp": "2025-09-04 03:59:49.924748", + "step": 1637, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 03:59:50.035679", + "step": 1637, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04370134696364403, + "timestamp": "2025-09-04 03:59:50.056082", + "step": 1638, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:59:50.155714", + "step": 1638, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0329529233276844, + "timestamp": "2025-09-04 03:59:50.174460", + "step": 1639, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 03:59:50.260151", + "step": 1639, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.07019810378551483, + "timestamp": "2025-09-04 03:59:50.276294", + "step": 1640, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 03:59:58.677353", + "step": 1640, + "epoch": 2 + }, + { + "type": "pplx", + "content": 337.12616741483475, + "timestamp": "2025-09-04 03:59:58.679743", + "step": 1640, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1640", + "timestamp": "2025-09-04 03:59:59.042413", + "step": 1640, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 03:59:59.118171", + "step": 1640, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008293285965919495, + "timestamp": "2025-09-04 03:59:59.133120", + "step": 1641, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 03:59:59.207261", + "step": 1641, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.032241858541965485, + "timestamp": "2025-09-04 03:59:59.220518", + "step": 1642, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 03:59:59.327128", + "step": 1642, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022532925009727478, + "timestamp": "2025-09-04 03:59:59.347081", + "step": 1643, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 03:59:59.456331", + "step": 1643, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02694687992334366, + "timestamp": "2025-09-04 03:59:59.477507", + "step": 1644, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 03:59:59.574944", + "step": 1644, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04047459363937378, + "timestamp": "2025-09-04 03:59:59.595569", + "step": 1645, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 03:59:59.733413", + "step": 1645, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.029626868665218353, + "timestamp": "2025-09-04 03:59:59.759206", + "step": 1646, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 03:59:59.863484", + "step": 1646, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.06880754232406616, + "timestamp": "2025-09-04 03:59:59.882549", + "step": 1647, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 03:59:59.955783", + "step": 1647, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01954088918864727, + "timestamp": "2025-09-04 03:59:59.969438", + "step": 1648, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:00:00.065814", + "step": 1648, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03680575639009476, + "timestamp": "2025-09-04 04:00:00.085936", + "step": 1649, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:00:00.190966", + "step": 1649, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05359693989157677, + "timestamp": "2025-09-04 04:00:00.210087", + "step": 1650, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:00:00.285825", + "step": 1650, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010023823007941246, + "timestamp": "2025-09-04 04:00:00.299422", + "step": 1651, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:00:00.382070", + "step": 1651, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012647191993892193, + "timestamp": "2025-09-04 04:00:00.397724", + "step": 1652, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:00:00.494834", + "step": 1652, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024116551503539085, + "timestamp": "2025-09-04 04:00:00.515369", + "step": 1653, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:00:00.602080", + "step": 1653, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012986271642148495, + "timestamp": "2025-09-04 04:00:00.617510", + "step": 1654, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:00:00.704538", + "step": 1654, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0540911890566349, + "timestamp": "2025-09-04 04:00:00.719990", + "step": 1655, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:00:00.820816", + "step": 1655, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007742506917566061, + "timestamp": "2025-09-04 04:00:00.840248", + "step": 1656, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 04:00:00.959478", + "step": 1656, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010772481560707092, + "timestamp": "2025-09-04 04:00:00.984758", + "step": 1657, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:00:01.065179", + "step": 1657, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02090446837246418, + "timestamp": "2025-09-04 04:00:01.079075", + "step": 1658, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:00:01.186014", + "step": 1658, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.001743293716572225, + "timestamp": "2025-09-04 04:00:01.205789", + "step": 1659, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 848 + ], + "flops": 16960103024960.0 + }, + "timestamp": "2025-09-04 04:00:01.334218", + "step": 1659, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010634549893438816, + "timestamp": "2025-09-04 04:00:01.359030", + "step": 1660, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:00:09.817753", + "step": 1660, + "epoch": 2 + }, + { + "type": "pplx", + "content": 331.6593289744211, + "timestamp": "2025-09-04 04:00:09.820816", + "step": 1660, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:00:09.894815", + "step": 1660, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02064441703259945, + "timestamp": "2025-09-04 04:00:09.909470", + "step": 1661, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:00:10.012229", + "step": 1661, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015596513636410236, + "timestamp": "2025-09-04 04:00:10.031405", + "step": 1662, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:00:10.132215", + "step": 1662, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011088766157627106, + "timestamp": "2025-09-04 04:00:10.150840", + "step": 1663, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:00:10.273945", + "step": 1663, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012567834928631783, + "timestamp": "2025-09-04 04:00:10.295058", + "step": 1664, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:00:10.403246", + "step": 1664, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.06560499221086502, + "timestamp": "2025-09-04 04:00:10.425808", + "step": 1665, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:00:10.519387", + "step": 1665, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012841533869504929, + "timestamp": "2025-09-04 04:00:10.536398", + "step": 1666, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:00:10.650045", + "step": 1666, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003418769920244813, + "timestamp": "2025-09-04 04:00:10.670606", + "step": 1667, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:00:10.766374", + "step": 1667, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02538418211042881, + "timestamp": "2025-09-04 04:00:10.784387", + "step": 1668, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:00:10.884595", + "step": 1668, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.08917974680662155, + "timestamp": "2025-09-04 04:00:10.905588", + "step": 1669, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:00:11.005904", + "step": 1669, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.052654486149549484, + "timestamp": "2025-09-04 04:00:11.024342", + "step": 1670, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:00:11.123525", + "step": 1670, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0234109815210104, + "timestamp": "2025-09-04 04:00:11.141882", + "step": 1671, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:00:11.229789", + "step": 1671, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03294301778078079, + "timestamp": "2025-09-04 04:00:11.245909", + "step": 1672, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 04:00:11.315897", + "step": 1672, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01805739291012287, + "timestamp": "2025-09-04 04:00:11.329916", + "step": 1673, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:00:11.434633", + "step": 1673, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01714208535850048, + "timestamp": "2025-09-04 04:00:11.453840", + "step": 1674, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 864 + ], + "flops": 17280104967552.0 + }, + "timestamp": "2025-09-04 04:00:11.580861", + "step": 1674, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04294885694980621, + "timestamp": "2025-09-04 04:00:11.605060", + "step": 1675, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1040 + ], + "flops": 20800126336064.0 + }, + "timestamp": "2025-09-04 04:00:11.755808", + "step": 1675, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04310276731848717, + "timestamp": "2025-09-04 04:00:11.785965", + "step": 1676, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:00:11.886494", + "step": 1676, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03321857005357742, + "timestamp": "2025-09-04 04:00:11.907370", + "step": 1677, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:00:11.986545", + "step": 1677, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014885162003338337, + "timestamp": "2025-09-04 04:00:12.000644", + "step": 1678, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:00:12.102838", + "step": 1678, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00889945961534977, + "timestamp": "2025-09-04 04:00:12.121999", + "step": 1679, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:00:12.226444", + "step": 1679, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0010784993646666408, + "timestamp": "2025-09-04 04:00:12.243714", + "step": 1680, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:00:20.689461", + "step": 1680, + "epoch": 2 + }, + { + "type": "pplx", + "content": 331.5873085585238, + "timestamp": "2025-09-04 04:00:20.691749", + "step": 1680, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1680", + "timestamp": "2025-09-04 04:00:21.056049", + "step": 1680, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:00:21.143665", + "step": 1680, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007037813309580088, + "timestamp": "2025-09-04 04:00:21.161779", + "step": 1681, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:00:21.272043", + "step": 1681, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006806929595768452, + "timestamp": "2025-09-04 04:00:21.292471", + "step": 1682, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:00:21.396953", + "step": 1682, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0040665543638169765, + "timestamp": "2025-09-04 04:00:21.416216", + "step": 1683, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:00:21.506686", + "step": 1683, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009878264740109444, + "timestamp": "2025-09-04 04:00:21.523945", + "step": 1684, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:00:21.615259", + "step": 1684, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016732510179281235, + "timestamp": "2025-09-04 04:00:21.634022", + "step": 1685, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:00:21.744343", + "step": 1685, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0031748716719448566, + "timestamp": "2025-09-04 04:00:21.764249", + "step": 1686, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:00:21.856813", + "step": 1686, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04727555438876152, + "timestamp": "2025-09-04 04:00:21.874028", + "step": 1687, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:00:21.952413", + "step": 1687, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011287051253020763, + "timestamp": "2025-09-04 04:00:21.967393", + "step": 1688, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:00:22.067839", + "step": 1688, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012352891266345978, + "timestamp": "2025-09-04 04:00:22.088808", + "step": 1689, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:00:22.166671", + "step": 1689, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.041657764464616776, + "timestamp": "2025-09-04 04:00:22.180689", + "step": 1690, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:00:22.258826", + "step": 1690, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013096681796014309, + "timestamp": "2025-09-04 04:00:22.272579", + "step": 1691, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:00:22.359084", + "step": 1691, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010199989192187786, + "timestamp": "2025-09-04 04:00:22.375432", + "step": 1692, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:00:22.472488", + "step": 1692, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009935924783349037, + "timestamp": "2025-09-04 04:00:22.492845", + "step": 1693, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:00:22.586149", + "step": 1693, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0010472126305103302, + "timestamp": "2025-09-04 04:00:22.603478", + "step": 1694, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:00:22.707303", + "step": 1694, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013101182878017426, + "timestamp": "2025-09-04 04:00:22.726713", + "step": 1695, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:00:22.838549", + "step": 1695, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00893877912312746, + "timestamp": "2025-09-04 04:00:22.859904", + "step": 1696, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:00:22.957935", + "step": 1696, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012058167718350887, + "timestamp": "2025-09-04 04:00:22.978574", + "step": 1697, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:00:23.069815", + "step": 1697, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03133353590965271, + "timestamp": "2025-09-04 04:00:23.086718", + "step": 1698, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:00:23.190329", + "step": 1698, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004476282745599747, + "timestamp": "2025-09-04 04:00:23.209733", + "step": 1699, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:00:23.309449", + "step": 1699, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.025943979620933533, + "timestamp": "2025-09-04 04:00:23.328940", + "step": 1700, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:00:31.826010", + "step": 1700, + "epoch": 2 + }, + { + "type": "pplx", + "content": 333.02580516691586, + "timestamp": "2025-09-04 04:00:31.828237", + "step": 1700, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:00:31.926110", + "step": 1700, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017599618062376976, + "timestamp": "2025-09-04 04:00:31.947097", + "step": 1701, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:00:32.040262", + "step": 1701, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0015532653778791428, + "timestamp": "2025-09-04 04:00:32.057510", + "step": 1702, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:00:32.143074", + "step": 1702, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.019918687641620636, + "timestamp": "2025-09-04 04:00:32.158506", + "step": 1703, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 04:00:32.232768", + "step": 1703, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005274395924061537, + "timestamp": "2025-09-04 04:00:32.246605", + "step": 1704, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:00:32.337510", + "step": 1704, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01871996931731701, + "timestamp": "2025-09-04 04:00:32.356341", + "step": 1705, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:00:32.441434", + "step": 1705, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04257494583725929, + "timestamp": "2025-09-04 04:00:32.456936", + "step": 1706, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:00:32.552180", + "step": 1706, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020532267168164253, + "timestamp": "2025-09-04 04:00:32.569560", + "step": 1707, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:00:32.645022", + "step": 1707, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004226659890264273, + "timestamp": "2025-09-04 04:00:32.659438", + "step": 1708, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:00:32.766042", + "step": 1708, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008803565986454487, + "timestamp": "2025-09-04 04:00:32.788565", + "step": 1709, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:00:32.895625", + "step": 1709, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02005820721387863, + "timestamp": "2025-09-04 04:00:32.915733", + "step": 1710, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:00:33.006906", + "step": 1710, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.1214195266366005, + "timestamp": "2025-09-04 04:00:33.023783", + "step": 1711, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:00:33.126759", + "step": 1711, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.045690055936574936, + "timestamp": "2025-09-04 04:00:33.144887", + "step": 1712, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:00:33.259834", + "step": 1712, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0035067156422883272, + "timestamp": "2025-09-04 04:00:33.284236", + "step": 1713, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:00:33.378429", + "step": 1713, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012944525107741356, + "timestamp": "2025-09-04 04:00:33.395663", + "step": 1714, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:00:33.501202", + "step": 1714, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0048626684583723545, + "timestamp": "2025-09-04 04:00:33.520594", + "step": 1715, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:00:33.598180", + "step": 1715, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01824316196143627, + "timestamp": "2025-09-04 04:00:33.613069", + "step": 1716, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:00:33.711266", + "step": 1716, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008359821513295174, + "timestamp": "2025-09-04 04:00:33.732074", + "step": 1717, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:00:33.810997", + "step": 1717, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.056332752108573914, + "timestamp": "2025-09-04 04:00:33.824856", + "step": 1718, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 04:00:33.959830", + "step": 1718, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0033087804913520813, + "timestamp": "2025-09-04 04:00:33.985444", + "step": 1719, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:00:34.087256", + "step": 1719, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0023557273671031, + "timestamp": "2025-09-04 04:00:34.106749", + "step": 1720, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:00:42.599713", + "step": 1720, + "epoch": 2 + }, + { + "type": "pplx", + "content": 333.25088739723225, + "timestamp": "2025-09-04 04:00:42.601861", + "step": 1720, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1720", + "timestamp": "2025-09-04 04:00:42.959913", + "step": 1720, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:00:43.064965", + "step": 1720, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03474944829940796, + "timestamp": "2025-09-04 04:00:43.087465", + "step": 1721, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:00:43.198541", + "step": 1721, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005826589651405811, + "timestamp": "2025-09-04 04:00:43.218959", + "step": 1722, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 800 + ], + "flops": 16000097197184.0 + }, + "timestamp": "2025-09-04 04:00:43.339501", + "step": 1722, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0146811967715621, + "timestamp": "2025-09-04 04:00:43.361337", + "step": 1723, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:00:43.445239", + "step": 1723, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.027011031284928322, + "timestamp": "2025-09-04 04:00:43.461157", + "step": 1724, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:00:43.555904", + "step": 1724, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009984579868614674, + "timestamp": "2025-09-04 04:00:43.574597", + "step": 1725, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:00:43.674494", + "step": 1725, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03857394680380821, + "timestamp": "2025-09-04 04:00:43.693031", + "step": 1726, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:00:43.786677", + "step": 1726, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0024763226974755526, + "timestamp": "2025-09-04 04:00:43.803892", + "step": 1727, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:00:43.913846", + "step": 1727, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02332633174955845, + "timestamp": "2025-09-04 04:00:43.935398", + "step": 1728, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:00:44.036422", + "step": 1728, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.06381595879793167, + "timestamp": "2025-09-04 04:00:44.057407", + "step": 1729, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:00:44.153790", + "step": 1729, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0381832979619503, + "timestamp": "2025-09-04 04:00:44.171248", + "step": 1730, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:00:44.265710", + "step": 1730, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.027034031227231026, + "timestamp": "2025-09-04 04:00:44.283200", + "step": 1731, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:00:44.378747", + "step": 1731, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022275667637586594, + "timestamp": "2025-09-04 04:00:44.397024", + "step": 1732, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:00:44.502743", + "step": 1732, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.11091621220111847, + "timestamp": "2025-09-04 04:00:44.525023", + "step": 1733, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:00:44.636832", + "step": 1733, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0019922046922147274, + "timestamp": "2025-09-04 04:00:44.657096", + "step": 1734, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:00:44.757668", + "step": 1734, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.001954772276803851, + "timestamp": "2025-09-04 04:00:44.776302", + "step": 1735, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:00:44.879280", + "step": 1735, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0044233831577003, + "timestamp": "2025-09-04 04:00:44.898969", + "step": 1736, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:00:44.981562", + "step": 1736, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0020844575483351946, + "timestamp": "2025-09-04 04:00:44.998177", + "step": 1737, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:00:45.088833", + "step": 1737, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021219972521066666, + "timestamp": "2025-09-04 04:00:45.105675", + "step": 1738, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:00:45.200766", + "step": 1738, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0035940525121986866, + "timestamp": "2025-09-04 04:00:45.218204", + "step": 1739, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:00:45.322209", + "step": 1739, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004270992241799831, + "timestamp": "2025-09-04 04:00:45.342196", + "step": 1740, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:00:53.844925", + "step": 1740, + "epoch": 2 + }, + { + "type": "pplx", + "content": 332.1656256388727, + "timestamp": "2025-09-04 04:00:53.848701", + "step": 1740, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:00:53.945874", + "step": 1740, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011263997294008732, + "timestamp": "2025-09-04 04:00:53.966713", + "step": 1741, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1488 + ], + "flops": 29760180728640.0 + }, + "timestamp": "2025-09-04 04:00:54.186951", + "step": 1741, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0082445302978158, + "timestamp": "2025-09-04 04:00:54.229291", + "step": 1742, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:00:54.333877", + "step": 1742, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015566142275929451, + "timestamp": "2025-09-04 04:00:54.353256", + "step": 1743, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:00:54.446192", + "step": 1743, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007874183356761932, + "timestamp": "2025-09-04 04:00:54.464227", + "step": 1744, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:00:54.538554", + "step": 1744, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03222401440143585, + "timestamp": "2025-09-04 04:00:54.553386", + "step": 1745, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:00:54.665546", + "step": 1745, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005354198161512613, + "timestamp": "2025-09-04 04:00:54.686098", + "step": 1746, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:00:54.773362", + "step": 1746, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02252938598394394, + "timestamp": "2025-09-04 04:00:54.789032", + "step": 1747, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:00:54.880684", + "step": 1747, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016836240887641907, + "timestamp": "2025-09-04 04:00:54.898361", + "step": 1748, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:00:55.003472", + "step": 1748, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.060379642993211746, + "timestamp": "2025-09-04 04:00:55.025325", + "step": 1749, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:00:55.127851", + "step": 1749, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00034839441650547087, + "timestamp": "2025-09-04 04:00:55.147125", + "step": 1750, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:00:55.241660", + "step": 1750, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006921238731592894, + "timestamp": "2025-09-04 04:00:55.259134", + "step": 1751, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:00:55.355817", + "step": 1751, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0024503259919583797, + "timestamp": "2025-09-04 04:00:55.373415", + "step": 1752, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:00:55.474861", + "step": 1752, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013867728412151337, + "timestamp": "2025-09-04 04:00:55.495862", + "step": 1753, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:00:55.593558", + "step": 1753, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012754159979522228, + "timestamp": "2025-09-04 04:00:55.611128", + "step": 1754, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:00:55.720755", + "step": 1754, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016128217801451683, + "timestamp": "2025-09-04 04:00:55.741196", + "step": 1755, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:00:55.832248", + "step": 1755, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002366261789575219, + "timestamp": "2025-09-04 04:00:55.849776", + "step": 1756, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:00:55.931287", + "step": 1756, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012999413534998894, + "timestamp": "2025-09-04 04:00:55.947967", + "step": 1757, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:00:56.042427", + "step": 1757, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011377224698662758, + "timestamp": "2025-09-04 04:00:56.059803", + "step": 1758, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:00:56.160358", + "step": 1758, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.028042022138834, + "timestamp": "2025-09-04 04:00:56.179270", + "step": 1759, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:00:56.280616", + "step": 1759, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.07089690864086151, + "timestamp": "2025-09-04 04:00:56.299883", + "step": 1760, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:01:04.797965", + "step": 1760, + "epoch": 2 + }, + { + "type": "pplx", + "content": 331.5460997167647, + "timestamp": "2025-09-04 04:01:04.799966", + "step": 1760, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1760", + "timestamp": "2025-09-04 04:01:05.165288", + "step": 1760, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:01:05.245129", + "step": 1760, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005329194013029337, + "timestamp": "2025-09-04 04:01:05.261502", + "step": 1761, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:01:05.367704", + "step": 1761, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0019212173065170646, + "timestamp": "2025-09-04 04:01:05.387739", + "step": 1762, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:01:05.482905", + "step": 1762, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011046413332223892, + "timestamp": "2025-09-04 04:01:05.500152", + "step": 1763, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:01:05.603684", + "step": 1763, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02376765012741089, + "timestamp": "2025-09-04 04:01:05.623816", + "step": 1764, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:01:05.765786", + "step": 1764, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02195976860821247, + "timestamp": "2025-09-04 04:01:05.786601", + "step": 1765, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:01:05.872683", + "step": 1765, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.06642767786979675, + "timestamp": "2025-09-04 04:01:05.887982", + "step": 1766, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1376 + ], + "flops": 27520167130496.0 + }, + "timestamp": "2025-09-04 04:01:06.093441", + "step": 1766, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0038195352535694838, + "timestamp": "2025-09-04 04:01:06.132739", + "step": 1767, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:01:06.245766", + "step": 1767, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.026749003678560257, + "timestamp": "2025-09-04 04:01:06.267150", + "step": 1768, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:01:06.342459", + "step": 1768, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.032609354704618454, + "timestamp": "2025-09-04 04:01:06.357861", + "step": 1769, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:01:06.472022", + "step": 1769, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02017885632812977, + "timestamp": "2025-09-04 04:01:06.492669", + "step": 1770, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:01:06.570069", + "step": 1770, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01600290834903717, + "timestamp": "2025-09-04 04:01:06.583952", + "step": 1771, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 04:01:06.656885", + "step": 1771, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0048013306222856045, + "timestamp": "2025-09-04 04:01:06.670727", + "step": 1772, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 04:01:06.792023", + "step": 1772, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011499549262225628, + "timestamp": "2025-09-04 04:01:06.817313", + "step": 1773, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:01:06.894357", + "step": 1773, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007216259371489286, + "timestamp": "2025-09-04 04:01:06.908347", + "step": 1774, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 912 + ], + "flops": 18240110795328.0 + }, + "timestamp": "2025-09-04 04:01:07.041806", + "step": 1774, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.040591221302747726, + "timestamp": "2025-09-04 04:01:07.066436", + "step": 1775, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:01:07.165680", + "step": 1775, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.030231038108468056, + "timestamp": "2025-09-04 04:01:07.185187", + "step": 1776, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:01:07.278526", + "step": 1776, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04355088993906975, + "timestamp": "2025-09-04 04:01:07.297576", + "step": 1777, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:01:07.390198", + "step": 1777, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010329993441700935, + "timestamp": "2025-09-04 04:01:07.407455", + "step": 1778, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:01:07.494403", + "step": 1778, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.06678889691829681, + "timestamp": "2025-09-04 04:01:07.510066", + "step": 1779, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:01:07.603681", + "step": 1779, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03957043215632439, + "timestamp": "2025-09-04 04:01:07.621544", + "step": 1780, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:01:16.109115", + "step": 1780, + "epoch": 2 + }, + { + "type": "pplx", + "content": 332.86792094420156, + "timestamp": "2025-09-04 04:01:16.111280", + "step": 1780, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:01:16.214758", + "step": 1780, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02120288833975792, + "timestamp": "2025-09-04 04:01:16.237163", + "step": 1781, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:01:16.331514", + "step": 1781, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.07391846925020218, + "timestamp": "2025-09-04 04:01:16.348990", + "step": 1782, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:01:16.454583", + "step": 1782, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003579053794965148, + "timestamp": "2025-09-04 04:01:16.473828", + "step": 1783, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:01:16.583998", + "step": 1783, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00399737898260355, + "timestamp": "2025-09-04 04:01:16.604981", + "step": 1784, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:01:16.709762", + "step": 1784, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.06767401099205017, + "timestamp": "2025-09-04 04:01:16.732053", + "step": 1785, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:01:16.839280", + "step": 1785, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015294116921722889, + "timestamp": "2025-09-04 04:01:16.859246", + "step": 1786, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:01:16.966615", + "step": 1786, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.056523002684116364, + "timestamp": "2025-09-04 04:01:16.986545", + "step": 1787, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:01:17.093325", + "step": 1787, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.031245263293385506, + "timestamp": "2025-09-04 04:01:17.114048", + "step": 1788, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:01:17.189344", + "step": 1788, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013243497349321842, + "timestamp": "2025-09-04 04:01:17.204416", + "step": 1789, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:01:17.286843", + "step": 1789, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008389891125261784, + "timestamp": "2025-09-04 04:01:17.302064", + "step": 1790, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:01:17.396893", + "step": 1790, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.07410303503274918, + "timestamp": "2025-09-04 04:01:17.414517", + "step": 1791, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:01:17.492341", + "step": 1791, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04840013012290001, + "timestamp": "2025-09-04 04:01:17.507244", + "step": 1792, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:01:17.596637", + "step": 1792, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.033458348363637924, + "timestamp": "2025-09-04 04:01:17.615173", + "step": 1793, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:01:17.700214", + "step": 1793, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0031342965085059404, + "timestamp": "2025-09-04 04:01:17.715288", + "step": 1794, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:01:17.823263", + "step": 1794, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004811432678252459, + "timestamp": "2025-09-04 04:01:17.843383", + "step": 1795, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:01:17.943652", + "step": 1795, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.060861192643642426, + "timestamp": "2025-09-04 04:01:17.963035", + "step": 1796, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:01:18.038096", + "step": 1796, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0050119198858737946, + "timestamp": "2025-09-04 04:01:18.053478", + "step": 1797, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:01:18.130769", + "step": 1797, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03676461800932884, + "timestamp": "2025-09-04 04:01:18.144495", + "step": 1798, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:01:18.220106", + "step": 1798, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014785559847950935, + "timestamp": "2025-09-04 04:01:18.233902", + "step": 1799, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:01:18.317658", + "step": 1799, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008279402740299702, + "timestamp": "2025-09-04 04:01:18.333743", + "step": 1800, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:01:26.804956", + "step": 1800, + "epoch": 2 + }, + { + "type": "pplx", + "content": 334.6393289394288, + "timestamp": "2025-09-04 04:01:26.807267", + "step": 1800, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1800", + "timestamp": "2025-09-04 04:01:27.163015", + "step": 1800, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 800 + ], + "flops": 16000097197184.0 + }, + "timestamp": "2025-09-04 04:01:27.279145", + "step": 1800, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.031706344336271286, + "timestamp": "2025-09-04 04:01:27.302897", + "step": 1801, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:01:27.406870", + "step": 1801, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010657178238034248, + "timestamp": "2025-09-04 04:01:27.426238", + "step": 1802, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:01:27.524874", + "step": 1802, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006731742527335882, + "timestamp": "2025-09-04 04:01:27.543242", + "step": 1803, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:01:27.654231", + "step": 1803, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010640786960721016, + "timestamp": "2025-09-04 04:01:27.675378", + "step": 1804, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:01:27.767989", + "step": 1804, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0017573751974850893, + "timestamp": "2025-09-04 04:01:27.786871", + "step": 1805, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:01:27.861394", + "step": 1805, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0030143826734274626, + "timestamp": "2025-09-04 04:01:27.874925", + "step": 1806, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:01:27.988082", + "step": 1806, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05524804815649986, + "timestamp": "2025-09-04 04:01:28.008638", + "step": 1807, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:01:28.111875", + "step": 1807, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0070555852726101875, + "timestamp": "2025-09-04 04:01:28.131855", + "step": 1808, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:01:28.237339", + "step": 1808, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0052083320915699005, + "timestamp": "2025-09-04 04:01:28.259221", + "step": 1809, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:01:28.335457", + "step": 1809, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006322692148387432, + "timestamp": "2025-09-04 04:01:28.348911", + "step": 1810, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:01:28.444431", + "step": 1810, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0009704896947368979, + "timestamp": "2025-09-04 04:01:28.461713", + "step": 1811, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:01:28.563516", + "step": 1811, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.032470621168613434, + "timestamp": "2025-09-04 04:01:28.582668", + "step": 1812, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:01:28.690265", + "step": 1812, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.09668878465890884, + "timestamp": "2025-09-04 04:01:28.711902", + "step": 1813, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:01:28.825233", + "step": 1813, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.043325960636138916, + "timestamp": "2025-09-04 04:01:28.845811", + "step": 1814, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 04:01:28.919550", + "step": 1814, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03765127435326576, + "timestamp": "2025-09-04 04:01:28.932457", + "step": 1815, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:01:29.027274", + "step": 1815, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0037220031954348087, + "timestamp": "2025-09-04 04:01:29.045400", + "step": 1816, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:01:29.138382", + "step": 1816, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010741024278104305, + "timestamp": "2025-09-04 04:01:29.157429", + "step": 1817, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:01:29.260861", + "step": 1817, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008968895301222801, + "timestamp": "2025-09-04 04:01:29.279763", + "step": 1818, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:01:29.373806", + "step": 1818, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.044184453785419464, + "timestamp": "2025-09-04 04:01:29.390644", + "step": 1819, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:01:29.487551", + "step": 1819, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.000747227284591645, + "timestamp": "2025-09-04 04:01:29.505764", + "step": 1820, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:01:37.982577", + "step": 1820, + "epoch": 2 + }, + { + "type": "pplx", + "content": 334.53241911357134, + "timestamp": "2025-09-04 04:01:37.984505", + "step": 1820, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:01:38.089414", + "step": 1820, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0014711732510477304, + "timestamp": "2025-09-04 04:01:38.112040", + "step": 1821, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:01:38.205609", + "step": 1821, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010439584963023663, + "timestamp": "2025-09-04 04:01:38.222944", + "step": 1822, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:01:38.323038", + "step": 1822, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012518877163529396, + "timestamp": "2025-09-04 04:01:38.341806", + "step": 1823, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:01:38.444098", + "step": 1823, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04167770966887474, + "timestamp": "2025-09-04 04:01:38.463964", + "step": 1824, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:01:38.555702", + "step": 1824, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005846615415066481, + "timestamp": "2025-09-04 04:01:38.574612", + "step": 1825, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:01:38.675901", + "step": 1825, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.026406852528452873, + "timestamp": "2025-09-04 04:01:38.694683", + "step": 1826, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 384 + ], + "flops": 7680046689792.0 + }, + "timestamp": "2025-09-04 04:01:38.758694", + "step": 1826, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015116888098418713, + "timestamp": "2025-09-04 04:01:38.769907", + "step": 1827, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:01:38.887154", + "step": 1827, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.09362763166427612, + "timestamp": "2025-09-04 04:01:38.909974", + "step": 1828, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:01:38.999028", + "step": 1828, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.000753458240069449, + "timestamp": "2025-09-04 04:01:39.017403", + "step": 1829, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:01:39.097095", + "step": 1829, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021402668207883835, + "timestamp": "2025-09-04 04:01:39.111346", + "step": 1830, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:01:39.213078", + "step": 1830, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005958658177405596, + "timestamp": "2025-09-04 04:01:39.231929", + "step": 1831, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:01:39.333890", + "step": 1831, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.051472008228302, + "timestamp": "2025-09-04 04:01:39.353219", + "step": 1832, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:01:39.437888", + "step": 1832, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.030274610966444016, + "timestamp": "2025-09-04 04:01:39.454964", + "step": 1833, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:01:39.562661", + "step": 1833, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007288443390280008, + "timestamp": "2025-09-04 04:01:39.582323", + "step": 1834, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:01:39.676731", + "step": 1834, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006486637983471155, + "timestamp": "2025-09-04 04:01:39.693979", + "step": 1835, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:01:39.793975", + "step": 1835, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0005186740309000015, + "timestamp": "2025-09-04 04:01:39.813264", + "step": 1836, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:01:39.904684", + "step": 1836, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004083861596882343, + "timestamp": "2025-09-04 04:01:39.923216", + "step": 1837, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:01:40.025797", + "step": 1837, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018782733008265495, + "timestamp": "2025-09-04 04:01:40.045011", + "step": 1838, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:01:40.121633", + "step": 1838, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004725904669612646, + "timestamp": "2025-09-04 04:01:40.135403", + "step": 1839, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:01:40.226335", + "step": 1839, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.027214346453547478, + "timestamp": "2025-09-04 04:01:40.243999", + "step": 1840, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:01:48.732766", + "step": 1840, + "epoch": 2 + }, + { + "type": "pplx", + "content": 332.6223443151369, + "timestamp": "2025-09-04 04:01:48.735049", + "step": 1840, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1840", + "timestamp": "2025-09-04 04:01:49.155872", + "step": 1840, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:01:49.243365", + "step": 1840, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00866016000509262, + "timestamp": "2025-09-04 04:01:49.260285", + "step": 1841, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 816 + ], + "flops": 16320099139776.0 + }, + "timestamp": "2025-09-04 04:01:49.383488", + "step": 1841, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02382933534681797, + "timestamp": "2025-09-04 04:01:49.406702", + "step": 1842, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:01:49.501505", + "step": 1842, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018402768298983574, + "timestamp": "2025-09-04 04:01:49.518813", + "step": 1843, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:01:49.615531", + "step": 1843, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0033138843718916178, + "timestamp": "2025-09-04 04:01:49.633725", + "step": 1844, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 04:01:49.765165", + "step": 1844, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014491533860564232, + "timestamp": "2025-09-04 04:01:49.793552", + "step": 1845, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:01:49.893752", + "step": 1845, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.06931175291538239, + "timestamp": "2025-09-04 04:01:49.912401", + "step": 1846, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:01:50.006776", + "step": 1846, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04427943378686905, + "timestamp": "2025-09-04 04:01:50.023915", + "step": 1847, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:01:50.132323", + "step": 1847, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012512077577412128, + "timestamp": "2025-09-04 04:01:50.153223", + "step": 1848, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:01:50.245240", + "step": 1848, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009834478609263897, + "timestamp": "2025-09-04 04:01:50.263906", + "step": 1849, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:01:50.369649", + "step": 1849, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.07198784500360489, + "timestamp": "2025-09-04 04:01:50.388693", + "step": 1850, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:01:50.499515", + "step": 1850, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.031000154092907906, + "timestamp": "2025-09-04 04:01:50.519893", + "step": 1851, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:01:50.612703", + "step": 1851, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01771543361246586, + "timestamp": "2025-09-04 04:01:50.630038", + "step": 1852, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 04:01:50.751707", + "step": 1852, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009000863879919052, + "timestamp": "2025-09-04 04:01:50.777057", + "step": 1853, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:01:50.871081", + "step": 1853, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017614608630537987, + "timestamp": "2025-09-04 04:01:50.888216", + "step": 1854, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:01:50.989658", + "step": 1854, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0047741541638970375, + "timestamp": "2025-09-04 04:01:51.008530", + "step": 1855, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:01:51.099179", + "step": 1855, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.029526762664318085, + "timestamp": "2025-09-04 04:01:51.116803", + "step": 1856, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:01:51.208159", + "step": 1856, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01259857602417469, + "timestamp": "2025-09-04 04:01:51.227097", + "step": 1857, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:01:51.305343", + "step": 1857, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02670917473733425, + "timestamp": "2025-09-04 04:01:51.318941", + "step": 1858, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:01:51.419359", + "step": 1858, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0215139277279377, + "timestamp": "2025-09-04 04:01:51.437922", + "step": 1859, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:01:51.523670", + "step": 1859, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013091914355754852, + "timestamp": "2025-09-04 04:01:51.540128", + "step": 1860, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:02:00.150071", + "step": 1860, + "epoch": 2 + }, + { + "type": "pplx", + "content": 328.99443454458276, + "timestamp": "2025-09-04 04:02:00.152296", + "step": 1860, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:02:00.234249", + "step": 1860, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021875588223338127, + "timestamp": "2025-09-04 04:02:00.250938", + "step": 1861, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:02:00.341251", + "step": 1861, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0015288630966097116, + "timestamp": "2025-09-04 04:02:00.358039", + "step": 1862, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:02:00.458610", + "step": 1862, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03855356201529503, + "timestamp": "2025-09-04 04:02:00.477318", + "step": 1863, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:02:00.588810", + "step": 1863, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015515622682869434, + "timestamp": "2025-09-04 04:02:00.610288", + "step": 1864, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:02:00.699121", + "step": 1864, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020159801468253136, + "timestamp": "2025-09-04 04:02:00.717609", + "step": 1865, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:02:00.826565", + "step": 1865, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0021761921234428883, + "timestamp": "2025-09-04 04:02:00.846815", + "step": 1866, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:02:00.957794", + "step": 1866, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022660445421934128, + "timestamp": "2025-09-04 04:02:00.978402", + "step": 1867, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:02:01.062716", + "step": 1867, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004487188998609781, + "timestamp": "2025-09-04 04:02:01.078811", + "step": 1868, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:02:01.169919", + "step": 1868, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.034519318491220474, + "timestamp": "2025-09-04 04:02:01.189116", + "step": 1869, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:02:01.300376", + "step": 1869, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0377497524023056, + "timestamp": "2025-09-04 04:02:01.320997", + "step": 1870, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:02:01.415723", + "step": 1870, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015238355845212936, + "timestamp": "2025-09-04 04:02:01.433073", + "step": 1871, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:02:01.535777", + "step": 1871, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012231401167809963, + "timestamp": "2025-09-04 04:02:01.555722", + "step": 1872, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:02:01.663789", + "step": 1872, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003201687941327691, + "timestamp": "2025-09-04 04:02:01.686618", + "step": 1873, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:02:01.791786", + "step": 1873, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024663355201482773, + "timestamp": "2025-09-04 04:02:01.811100", + "step": 1874, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:02:01.904041", + "step": 1874, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.07429840415716171, + "timestamp": "2025-09-04 04:02:01.921296", + "step": 1875, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:02:02.021345", + "step": 1875, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0015133678680285811, + "timestamp": "2025-09-04 04:02:02.040982", + "step": 1876, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:02:02.144440", + "step": 1876, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.09027554839849472, + "timestamp": "2025-09-04 04:02:02.165531", + "step": 1877, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:02:02.265048", + "step": 1877, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.029700566083192825, + "timestamp": "2025-09-04 04:02:02.283427", + "step": 1878, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 04:02:02.355323", + "step": 1878, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015838583931326866, + "timestamp": "2025-09-04 04:02:02.368341", + "step": 1879, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:02:02.463178", + "step": 1879, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0317532904446125, + "timestamp": "2025-09-04 04:02:02.481273", + "step": 1880, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:02:10.983366", + "step": 1880, + "epoch": 2 + }, + { + "type": "pplx", + "content": 326.9121941560908, + "timestamp": "2025-09-04 04:02:10.985583", + "step": 1880, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1880", + "timestamp": "2025-09-04 04:02:11.488696", + "step": 1880, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:02:11.586020", + "step": 1880, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006327355746179819, + "timestamp": "2025-09-04 04:02:11.606788", + "step": 1881, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 432 + ], + "flops": 8640052517568.0 + }, + "timestamp": "2025-09-04 04:02:11.679312", + "step": 1881, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0010608435841277242, + "timestamp": "2025-09-04 04:02:11.692168", + "step": 1882, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:02:11.787450", + "step": 1882, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012697882950305939, + "timestamp": "2025-09-04 04:02:11.804854", + "step": 1883, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:02:11.912598", + "step": 1883, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01790587045252323, + "timestamp": "2025-09-04 04:02:11.933353", + "step": 1884, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:02:12.035106", + "step": 1884, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0027732134331017733, + "timestamp": "2025-09-04 04:02:12.056040", + "step": 1885, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:02:12.134830", + "step": 1885, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.063087597489357, + "timestamp": "2025-09-04 04:02:12.148978", + "step": 1886, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:02:12.256224", + "step": 1886, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0023315551225095987, + "timestamp": "2025-09-04 04:02:12.276242", + "step": 1887, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:02:12.383110", + "step": 1887, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011332008987665176, + "timestamp": "2025-09-04 04:02:12.403326", + "step": 1888, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:02:12.511780", + "step": 1888, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04571153596043587, + "timestamp": "2025-09-04 04:02:12.534517", + "step": 1889, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:02:12.617993", + "step": 1889, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05394493415951729, + "timestamp": "2025-09-04 04:02:12.633177", + "step": 1890, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:02:12.716784", + "step": 1890, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007151946425437927, + "timestamp": "2025-09-04 04:02:12.732026", + "step": 1891, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:02:12.832763", + "step": 1891, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03096769005060196, + "timestamp": "2025-09-04 04:02:12.852161", + "step": 1892, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:02:12.953361", + "step": 1892, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.019417183473706245, + "timestamp": "2025-09-04 04:02:12.974116", + "step": 1893, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:02:13.084289", + "step": 1893, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007123048882931471, + "timestamp": "2025-09-04 04:02:13.104493", + "step": 1894, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:02:13.216318", + "step": 1894, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00961579754948616, + "timestamp": "2025-09-04 04:02:13.236746", + "step": 1895, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:02:13.339570", + "step": 1895, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01128938514739275, + "timestamp": "2025-09-04 04:02:13.359169", + "step": 1896, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:02:13.451654", + "step": 1896, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.036590978503227234, + "timestamp": "2025-09-04 04:02:13.470687", + "step": 1897, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:02:13.564383", + "step": 1897, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0244086105376482, + "timestamp": "2025-09-04 04:02:13.581880", + "step": 1898, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:02:13.683164", + "step": 1898, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014544663019478321, + "timestamp": "2025-09-04 04:02:13.701976", + "step": 1899, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:02:13.780322", + "step": 1899, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023842498660087585, + "timestamp": "2025-09-04 04:02:13.795174", + "step": 1900, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:02:22.378146", + "step": 1900, + "epoch": 2 + }, + { + "type": "pplx", + "content": 327.30657016075753, + "timestamp": "2025-09-04 04:02:22.380281", + "step": 1900, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:02:22.453274", + "step": 1900, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.036261361092329025, + "timestamp": "2025-09-04 04:02:22.467795", + "step": 1901, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:02:22.571889", + "step": 1901, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0038794318679720163, + "timestamp": "2025-09-04 04:02:22.590900", + "step": 1902, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:02:22.697502", + "step": 1902, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00663451012223959, + "timestamp": "2025-09-04 04:02:22.715533", + "step": 1903, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:02:22.813576", + "step": 1903, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0043860250152647495, + "timestamp": "2025-09-04 04:02:22.831587", + "step": 1904, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:02:22.934235", + "step": 1904, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04079779237508774, + "timestamp": "2025-09-04 04:02:22.955046", + "step": 1905, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:02:23.051479", + "step": 1905, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011518475599586964, + "timestamp": "2025-09-04 04:02:23.068728", + "step": 1906, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:02:23.156438", + "step": 1906, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.07115134596824646, + "timestamp": "2025-09-04 04:02:23.171339", + "step": 1907, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:02:23.267997", + "step": 1907, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006972004193812609, + "timestamp": "2025-09-04 04:02:23.286112", + "step": 1908, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 800 + ], + "flops": 16000097197184.0 + }, + "timestamp": "2025-09-04 04:02:23.405084", + "step": 1908, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02820487506687641, + "timestamp": "2025-09-04 04:02:23.428196", + "step": 1909, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:02:23.533819", + "step": 1909, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021369347348809242, + "timestamp": "2025-09-04 04:02:23.552962", + "step": 1910, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:02:23.655938", + "step": 1910, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.031066900119185448, + "timestamp": "2025-09-04 04:02:23.673941", + "step": 1911, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:02:23.753399", + "step": 1911, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007914647459983826, + "timestamp": "2025-09-04 04:02:23.767444", + "step": 1912, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:02:23.850430", + "step": 1912, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009337898343801498, + "timestamp": "2025-09-04 04:02:23.866235", + "step": 1913, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:02:23.951961", + "step": 1913, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015139404684305191, + "timestamp": "2025-09-04 04:02:23.966375", + "step": 1914, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:02:24.072580", + "step": 1914, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020058702677488327, + "timestamp": "2025-09-04 04:02:24.091184", + "step": 1915, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:02:24.184250", + "step": 1915, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01975717768073082, + "timestamp": "2025-09-04 04:02:24.201038", + "step": 1916, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:02:24.301071", + "step": 1916, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03391404077410698, + "timestamp": "2025-09-04 04:02:24.321252", + "step": 1917, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 04:02:24.447433", + "step": 1917, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02063564583659172, + "timestamp": "2025-09-04 04:02:24.470101", + "step": 1918, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:02:24.576203", + "step": 1918, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0002946726162917912, + "timestamp": "2025-09-04 04:02:24.595051", + "step": 1919, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 816 + ], + "flops": 16320099139776.0 + }, + "timestamp": "2025-09-04 04:02:24.719215", + "step": 1919, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005154923070222139, + "timestamp": "2025-09-04 04:02:24.742337", + "step": 1920, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:02:33.304746", + "step": 1920, + "epoch": 2 + }, + { + "type": "pplx", + "content": 328.53727900122976, + "timestamp": "2025-09-04 04:02:33.308168", + "step": 1920, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1920", + "timestamp": "2025-09-04 04:02:33.808373", + "step": 1920, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:02:33.892978", + "step": 1920, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.049568939954042435, + "timestamp": "2025-09-04 04:02:33.910033", + "step": 1921, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:02:34.011859", + "step": 1921, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03862825781106949, + "timestamp": "2025-09-04 04:02:34.030696", + "step": 1922, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:02:34.108930", + "step": 1922, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003806066932156682, + "timestamp": "2025-09-04 04:02:34.123154", + "step": 1923, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:02:34.216966", + "step": 1923, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020454682409763336, + "timestamp": "2025-09-04 04:02:34.234930", + "step": 1924, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:02:34.350041", + "step": 1924, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.056430667638778687, + "timestamp": "2025-09-04 04:02:34.369848", + "step": 1925, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:02:34.501762", + "step": 1925, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007279661018401384, + "timestamp": "2025-09-04 04:02:34.520863", + "step": 1926, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:02:34.658902", + "step": 1926, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011289707385003567, + "timestamp": "2025-09-04 04:02:34.676419", + "step": 1927, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:02:34.772220", + "step": 1927, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0045151012018322945, + "timestamp": "2025-09-04 04:02:34.790651", + "step": 1928, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:02:34.879643", + "step": 1928, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016711879521608353, + "timestamp": "2025-09-04 04:02:34.897944", + "step": 1929, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:02:34.977184", + "step": 1929, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011652039363980293, + "timestamp": "2025-09-04 04:02:34.991213", + "step": 1930, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:02:35.084798", + "step": 1930, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00244183954782784, + "timestamp": "2025-09-04 04:02:35.102275", + "step": 1931, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:02:35.181891", + "step": 1931, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004456162918359041, + "timestamp": "2025-09-04 04:02:35.196813", + "step": 1932, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:02:35.294174", + "step": 1932, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018451880663633347, + "timestamp": "2025-09-04 04:02:35.313037", + "step": 1933, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:02:35.414679", + "step": 1933, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011028347536921501, + "timestamp": "2025-09-04 04:02:35.434011", + "step": 1934, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 944 + ], + "flops": 18880114680512.0 + }, + "timestamp": "2025-09-04 04:02:35.571788", + "step": 1934, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0028660639654845, + "timestamp": "2025-09-04 04:02:35.598124", + "step": 1935, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:02:35.698343", + "step": 1935, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03565828502178192, + "timestamp": "2025-09-04 04:02:35.718091", + "step": 1936, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:02:35.818676", + "step": 1936, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.10143911838531494, + "timestamp": "2025-09-04 04:02:35.839976", + "step": 1937, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:02:35.940531", + "step": 1937, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01908188872039318, + "timestamp": "2025-09-04 04:02:35.959130", + "step": 1938, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:02:36.063139", + "step": 1938, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018672922626137733, + "timestamp": "2025-09-04 04:02:36.082502", + "step": 1939, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:02:36.186225", + "step": 1939, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002945462241768837, + "timestamp": "2025-09-04 04:02:36.206294", + "step": 1940, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:02:44.692002", + "step": 1940, + "epoch": 2 + }, + { + "type": "pplx", + "content": 328.19710857649045, + "timestamp": "2025-09-04 04:02:44.693795", + "step": 1940, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 416 + ], + "flops": 8320050574976.0 + }, + "timestamp": "2025-09-04 04:02:44.760706", + "step": 1940, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006576020736247301, + "timestamp": "2025-09-04 04:02:44.774394", + "step": 1941, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:02:44.852530", + "step": 1941, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0037395476829260588, + "timestamp": "2025-09-04 04:02:44.866793", + "step": 1942, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:02:44.966761", + "step": 1942, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008964164182543755, + "timestamp": "2025-09-04 04:02:44.985466", + "step": 1943, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:02:45.085746", + "step": 1943, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010037235915660858, + "timestamp": "2025-09-04 04:02:45.105491", + "step": 1944, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:02:45.204877", + "step": 1944, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03502393513917923, + "timestamp": "2025-09-04 04:02:45.226011", + "step": 1945, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:02:45.335742", + "step": 1945, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007905172184109688, + "timestamp": "2025-09-04 04:02:45.356347", + "step": 1946, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:02:45.452352", + "step": 1946, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023761700838804245, + "timestamp": "2025-09-04 04:02:45.470038", + "step": 1947, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:02:45.579975", + "step": 1947, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0012213548179715872, + "timestamp": "2025-09-04 04:02:45.601147", + "step": 1948, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:02:45.693335", + "step": 1948, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0033678747713565826, + "timestamp": "2025-09-04 04:02:45.712436", + "step": 1949, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:02:45.807324", + "step": 1949, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0012253863969817758, + "timestamp": "2025-09-04 04:02:45.824862", + "step": 1950, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:02:45.933914", + "step": 1950, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004889811389148235, + "timestamp": "2025-09-04 04:02:45.954376", + "step": 1951, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:02:46.055612", + "step": 1951, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.025324570015072823, + "timestamp": "2025-09-04 04:02:46.074986", + "step": 1952, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:02:46.167876", + "step": 1952, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006361103150993586, + "timestamp": "2025-09-04 04:02:46.186888", + "step": 1953, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:02:46.288522", + "step": 1953, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.019413141533732414, + "timestamp": "2025-09-04 04:02:46.307129", + "step": 1954, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:02:46.409295", + "step": 1954, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005714345257729292, + "timestamp": "2025-09-04 04:02:46.426613", + "step": 1955, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:02:46.513059", + "step": 1955, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.056890930980443954, + "timestamp": "2025-09-04 04:02:46.529361", + "step": 1956, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:02:46.603085", + "step": 1956, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.049041591584682465, + "timestamp": "2025-09-04 04:02:46.618188", + "step": 1957, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:02:46.728342", + "step": 1957, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021011320874094963, + "timestamp": "2025-09-04 04:02:46.748819", + "step": 1958, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:02:46.843211", + "step": 1958, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004129297100007534, + "timestamp": "2025-09-04 04:02:46.860665", + "step": 1959, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:02:46.967025", + "step": 1959, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0063214851543307304, + "timestamp": "2025-09-04 04:02:46.987480", + "step": 1960, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:02:55.500629", + "step": 1960, + "epoch": 2 + }, + { + "type": "pplx", + "content": 328.783539244894, + "timestamp": "2025-09-04 04:02:55.502583", + "step": 1960, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 1960", + "timestamp": "2025-09-04 04:02:56.015154", + "step": 1960, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:02:56.116017", + "step": 1960, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02819206565618515, + "timestamp": "2025-09-04 04:02:56.137065", + "step": 1961, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:02:56.227546", + "step": 1961, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0029589931946247816, + "timestamp": "2025-09-04 04:02:56.244287", + "step": 1962, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:02:56.343431", + "step": 1962, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01074980664998293, + "timestamp": "2025-09-04 04:02:56.362178", + "step": 1963, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:02:56.461841", + "step": 1963, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010816739872097969, + "timestamp": "2025-09-04 04:02:56.481319", + "step": 1964, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:02:56.585680", + "step": 1964, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008484927006065845, + "timestamp": "2025-09-04 04:02:56.607894", + "step": 1965, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:02:56.704314", + "step": 1965, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018219899386167526, + "timestamp": "2025-09-04 04:02:56.721834", + "step": 1966, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:02:56.815874", + "step": 1966, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011172082275152206, + "timestamp": "2025-09-04 04:02:56.833246", + "step": 1967, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:02:56.941573", + "step": 1967, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015450472943484783, + "timestamp": "2025-09-04 04:02:56.962671", + "step": 1968, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:02:57.066734", + "step": 1968, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.036945831030607224, + "timestamp": "2025-09-04 04:02:57.088685", + "step": 1969, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 04:02:57.160350", + "step": 1969, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0042557851411402225, + "timestamp": "2025-09-04 04:02:57.173388", + "step": 1970, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:02:57.258959", + "step": 1970, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.019967155531048775, + "timestamp": "2025-09-04 04:02:57.274551", + "step": 1971, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:02:57.381802", + "step": 1971, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012040197849273682, + "timestamp": "2025-09-04 04:02:57.402724", + "step": 1972, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:02:57.493730", + "step": 1972, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01980067417025566, + "timestamp": "2025-09-04 04:02:57.512481", + "step": 1973, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:02:57.605693", + "step": 1973, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008740060031414032, + "timestamp": "2025-09-04 04:02:57.622984", + "step": 1974, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:02:57.727696", + "step": 1974, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005561890080571175, + "timestamp": "2025-09-04 04:02:57.747152", + "step": 1975, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:02:57.843200", + "step": 1975, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.026673782616853714, + "timestamp": "2025-09-04 04:02:57.861480", + "step": 1976, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:02:57.964421", + "step": 1976, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.06876686215400696, + "timestamp": "2025-09-04 04:02:57.986466", + "step": 1977, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1120 + ], + "flops": 22400136049024.0 + }, + "timestamp": "2025-09-04 04:02:58.148841", + "step": 1977, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005590865388512611, + "timestamp": "2025-09-04 04:02:58.180782", + "step": 1978, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:02:58.264369", + "step": 1978, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004839139059185982, + "timestamp": "2025-09-04 04:02:58.279672", + "step": 1979, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:02:58.383284", + "step": 1979, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02411733940243721, + "timestamp": "2025-09-04 04:02:58.403072", + "step": 1980, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:03:06.910649", + "step": 1980, + "epoch": 2 + }, + { + "type": "pplx", + "content": 331.13313135590505, + "timestamp": "2025-09-04 04:03:06.913387", + "step": 1980, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:03:07.009201", + "step": 1980, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.039461344480514526, + "timestamp": "2025-09-04 04:03:07.029715", + "step": 1981, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:03:07.132787", + "step": 1981, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013207260519266129, + "timestamp": "2025-09-04 04:03:07.151959", + "step": 1982, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:03:07.256472", + "step": 1982, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.029344527050852776, + "timestamp": "2025-09-04 04:03:07.275850", + "step": 1983, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:03:07.380127", + "step": 1983, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0013410469982773066, + "timestamp": "2025-09-04 04:03:07.400223", + "step": 1984, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:03:07.491158", + "step": 1984, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01028304360806942, + "timestamp": "2025-09-04 04:03:07.510046", + "step": 1985, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:03:07.627621", + "step": 1985, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023101340979337692, + "timestamp": "2025-09-04 04:03:07.649726", + "step": 1986, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:03:07.741169", + "step": 1986, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012508483603596687, + "timestamp": "2025-09-04 04:03:07.757857", + "step": 1987, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:03:07.857215", + "step": 1987, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00880645215511322, + "timestamp": "2025-09-04 04:03:07.875535", + "step": 1988, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 800 + ], + "flops": 16000097197184.0 + }, + "timestamp": "2025-09-04 04:03:07.992983", + "step": 1988, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00218992680311203, + "timestamp": "2025-09-04 04:03:08.016938", + "step": 1989, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:03:08.124746", + "step": 1989, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05591468885540962, + "timestamp": "2025-09-04 04:03:08.144878", + "step": 1990, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:03:08.223275", + "step": 1990, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011654703877866268, + "timestamp": "2025-09-04 04:03:08.237515", + "step": 1991, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:03:08.320578", + "step": 1991, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012211965397000313, + "timestamp": "2025-09-04 04:03:08.336518", + "step": 1992, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:03:08.418294", + "step": 1992, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006080134306102991, + "timestamp": "2025-09-04 04:03:08.435040", + "step": 1993, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:03:08.541873", + "step": 1993, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0031754986848682165, + "timestamp": "2025-09-04 04:03:08.561950", + "step": 1994, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 04:03:08.697575", + "step": 1994, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022798430174589157, + "timestamp": "2025-09-04 04:03:08.723260", + "step": 1995, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:03:08.823733", + "step": 1995, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.055928491055965424, + "timestamp": "2025-09-04 04:03:08.840166", + "step": 1996, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:03:08.913556", + "step": 1996, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006855425424873829, + "timestamp": "2025-09-04 04:03:08.928424", + "step": 1997, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 04:03:09.000715", + "step": 1997, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022756323218345642, + "timestamp": "2025-09-04 04:03:09.013661", + "step": 1998, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:03:09.108477", + "step": 1998, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01630261540412903, + "timestamp": "2025-09-04 04:03:09.125996", + "step": 1999, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:03:09.219144", + "step": 1999, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00793174933642149, + "timestamp": "2025-09-04 04:03:09.237199", + "step": 2000, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:03:17.717901", + "step": 2000, + "epoch": 2 + }, + { + "type": "pplx", + "content": 337.7910185199579, + "timestamp": "2025-09-04 04:03:17.719822", + "step": 2000, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2000", + "timestamp": "2025-09-04 04:03:18.081079", + "step": 2000, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:03:18.183814", + "step": 2000, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0038866132963448763, + "timestamp": "2025-09-04 04:03:18.205004", + "step": 2001, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:03:18.311859", + "step": 2001, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.029996544122695923, + "timestamp": "2025-09-04 04:03:18.331766", + "step": 2002, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:03:18.412991", + "step": 2002, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.029153253883123398, + "timestamp": "2025-09-04 04:03:18.426980", + "step": 2003, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:03:18.519253", + "step": 2003, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.059160567820072174, + "timestamp": "2025-09-04 04:03:18.536705", + "step": 2004, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:03:18.636731", + "step": 2004, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02167946845293045, + "timestamp": "2025-09-04 04:03:18.657389", + "step": 2005, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:03:18.761805", + "step": 2005, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0188386719673872, + "timestamp": "2025-09-04 04:03:18.780867", + "step": 2006, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:03:18.887731", + "step": 2006, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0005847454303875566, + "timestamp": "2025-09-04 04:03:18.906473", + "step": 2007, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:03:18.987245", + "step": 2007, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008029515855014324, + "timestamp": "2025-09-04 04:03:19.002000", + "step": 2008, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:03:19.077244", + "step": 2008, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007086843717843294, + "timestamp": "2025-09-04 04:03:19.091979", + "step": 2009, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:03:19.214193", + "step": 2009, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008636104874312878, + "timestamp": "2025-09-04 04:03:19.233921", + "step": 2010, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:03:19.346341", + "step": 2010, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00841162633150816, + "timestamp": "2025-09-04 04:03:19.366348", + "step": 2011, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:03:19.478086", + "step": 2011, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005192655138671398, + "timestamp": "2025-09-04 04:03:19.499294", + "step": 2012, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:03:19.584853", + "step": 2012, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016467098146677017, + "timestamp": "2025-09-04 04:03:19.601656", + "step": 2013, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:03:19.714091", + "step": 2013, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004678299650549889, + "timestamp": "2025-09-04 04:03:19.734578", + "step": 2014, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:03:19.829432", + "step": 2014, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.038956332951784134, + "timestamp": "2025-09-04 04:03:19.846129", + "step": 2015, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:03:19.946809", + "step": 2015, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013382869772613049, + "timestamp": "2025-09-04 04:03:19.966059", + "step": 2016, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:03:20.056506", + "step": 2016, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006615063641220331, + "timestamp": "2025-09-04 04:03:20.074773", + "step": 2017, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:03:20.154212", + "step": 2017, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.035883184522390366, + "timestamp": "2025-09-04 04:03:20.168191", + "step": 2018, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:03:20.272520", + "step": 2018, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0023406874388456345, + "timestamp": "2025-09-04 04:03:20.291716", + "step": 2019, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:03:20.383383", + "step": 2019, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.09516555815935135, + "timestamp": "2025-09-04 04:03:20.400858", + "step": 2020, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:03:28.856700", + "step": 2020, + "epoch": 2 + }, + { + "type": "pplx", + "content": 339.743606509487, + "timestamp": "2025-09-04 04:03:28.858645", + "step": 2020, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 04:03:28.975803", + "step": 2020, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0009116530418395996, + "timestamp": "2025-09-04 04:03:29.001353", + "step": 2021, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:03:29.087980", + "step": 2021, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0017411328153684735, + "timestamp": "2025-09-04 04:03:29.103549", + "step": 2022, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:03:29.198141", + "step": 2022, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018314287066459656, + "timestamp": "2025-09-04 04:03:29.215622", + "step": 2023, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:03:29.316793", + "step": 2023, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004151246044784784, + "timestamp": "2025-09-04 04:03:29.336267", + "step": 2024, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:03:29.412528", + "step": 2024, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03218241408467293, + "timestamp": "2025-09-04 04:03:29.428024", + "step": 2025, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:03:29.532196", + "step": 2025, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015404434874653816, + "timestamp": "2025-09-04 04:03:29.551399", + "step": 2026, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:03:29.627359", + "step": 2026, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.057021476328372955, + "timestamp": "2025-09-04 04:03:29.641204", + "step": 2027, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:03:29.746873", + "step": 2027, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0008856714703142643, + "timestamp": "2025-09-04 04:03:29.767001", + "step": 2028, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:03:29.841846", + "step": 2028, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022210262715816498, + "timestamp": "2025-09-04 04:03:29.857048", + "step": 2029, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:03:29.967779", + "step": 2029, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005982580129057169, + "timestamp": "2025-09-04 04:03:29.988354", + "step": 2030, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:03:30.074993", + "step": 2030, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0008112862706184387, + "timestamp": "2025-09-04 04:03:30.090723", + "step": 2031, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 848 + ], + "flops": 16960103024960.0 + }, + "timestamp": "2025-09-04 04:03:30.215707", + "step": 2031, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02068488672375679, + "timestamp": "2025-09-04 04:03:30.240475", + "step": 2032, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:03:30.331022", + "step": 2032, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03741706535220146, + "timestamp": "2025-09-04 04:03:30.349873", + "step": 2033, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:03:30.432706", + "step": 2033, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05829369276762009, + "timestamp": "2025-09-04 04:03:30.447994", + "step": 2034, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:03:30.540942", + "step": 2034, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008479294367134571, + "timestamp": "2025-09-04 04:03:30.558435", + "step": 2035, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:03:30.668268", + "step": 2035, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04389806091785431, + "timestamp": "2025-09-04 04:03:30.689495", + "step": 2036, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:03:30.787055", + "step": 2036, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02469690330326557, + "timestamp": "2025-09-04 04:03:30.807747", + "step": 2037, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:03:30.911430", + "step": 2037, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02199394628405571, + "timestamp": "2025-09-04 04:03:30.930735", + "step": 2038, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:03:31.033799", + "step": 2038, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0037276356015354395, + "timestamp": "2025-09-04 04:03:31.052730", + "step": 2039, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:03:31.131633", + "step": 2039, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021283473819494247, + "timestamp": "2025-09-04 04:03:31.146545", + "step": 2040, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:03:39.511875", + "step": 2040, + "epoch": 2 + }, + { + "type": "pplx", + "content": 334.85919534739503, + "timestamp": "2025-09-04 04:03:39.513963", + "step": 2040, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2040", + "timestamp": "2025-09-04 04:03:40.015188", + "step": 2040, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:03:40.116989", + "step": 2040, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02683926559984684, + "timestamp": "2025-09-04 04:03:40.138785", + "step": 2041, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:03:40.248182", + "step": 2041, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0386735163629055, + "timestamp": "2025-09-04 04:03:40.268736", + "step": 2042, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:03:40.369113", + "step": 2042, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00885379035025835, + "timestamp": "2025-09-04 04:03:40.387946", + "step": 2043, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:03:40.484312", + "step": 2043, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006810452789068222, + "timestamp": "2025-09-04 04:03:40.502525", + "step": 2044, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:03:40.590982", + "step": 2044, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022753320634365082, + "timestamp": "2025-09-04 04:03:40.609346", + "step": 2045, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:03:40.684482", + "step": 2045, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.051130082458257675, + "timestamp": "2025-09-04 04:03:40.698277", + "step": 2046, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:03:40.806236", + "step": 2046, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.028301551938056946, + "timestamp": "2025-09-04 04:03:40.826395", + "step": 2047, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:03:40.942751", + "step": 2047, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00401369109749794, + "timestamp": "2025-09-04 04:03:40.965669", + "step": 2048, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:03:41.073652", + "step": 2048, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008323295041918755, + "timestamp": "2025-09-04 04:03:41.096381", + "step": 2049, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:03:41.172951", + "step": 2049, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010393408127129078, + "timestamp": "2025-09-04 04:03:41.186961", + "step": 2050, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:03:41.276328", + "step": 2050, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0038162292912602425, + "timestamp": "2025-09-04 04:03:41.293090", + "step": 2051, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:03:41.402980", + "step": 2051, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0021734382025897503, + "timestamp": "2025-09-04 04:03:41.424184", + "step": 2052, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:03:41.522034", + "step": 2052, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.036829832941293716, + "timestamp": "2025-09-04 04:03:41.542779", + "step": 2053, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:03:41.650754", + "step": 2053, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015699857845902443, + "timestamp": "2025-09-04 04:03:41.670987", + "step": 2054, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:03:41.754688", + "step": 2054, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014599669724702835, + "timestamp": "2025-09-04 04:03:41.769707", + "step": 2055, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:03:41.877413", + "step": 2055, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.029209831729531288, + "timestamp": "2025-09-04 04:03:41.898633", + "step": 2056, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:03:41.974188", + "step": 2056, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022812718525528908, + "timestamp": "2025-09-04 04:03:41.989608", + "step": 2057, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:03:42.092066", + "step": 2057, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013986658304929733, + "timestamp": "2025-09-04 04:03:42.111323", + "step": 2058, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:03:42.199588", + "step": 2058, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.055983904749155045, + "timestamp": "2025-09-04 04:03:42.215227", + "step": 2059, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:03:42.319945", + "step": 2059, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007262484170496464, + "timestamp": "2025-09-04 04:03:42.339917", + "step": 2060, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:03:50.711637", + "step": 2060, + "epoch": 2 + }, + { + "type": "pplx", + "content": 326.4215391255511, + "timestamp": "2025-09-04 04:03:50.713840", + "step": 2060, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:03:50.791986", + "step": 2060, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009184667840600014, + "timestamp": "2025-09-04 04:03:50.808350", + "step": 2061, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:03:50.919805", + "step": 2061, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.07893642038106918, + "timestamp": "2025-09-04 04:03:50.940465", + "step": 2062, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:03:51.033591", + "step": 2062, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008863512426614761, + "timestamp": "2025-09-04 04:03:51.050975", + "step": 2063, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:03:51.151163", + "step": 2063, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007501260843127966, + "timestamp": "2025-09-04 04:03:51.170850", + "step": 2064, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:03:51.267183", + "step": 2064, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0035026571713387966, + "timestamp": "2025-09-04 04:03:51.287661", + "step": 2065, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:03:51.391160", + "step": 2065, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010278237983584404, + "timestamp": "2025-09-04 04:03:51.408252", + "step": 2066, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:03:51.507168", + "step": 2066, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.042156293988227844, + "timestamp": "2025-09-04 04:03:51.526171", + "step": 2067, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:03:51.627938", + "step": 2067, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024234874173998833, + "timestamp": "2025-09-04 04:03:51.647756", + "step": 2068, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:03:51.752621", + "step": 2068, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0024930352810770273, + "timestamp": "2025-09-04 04:03:51.775261", + "step": 2069, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:03:51.883717", + "step": 2069, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.043505195528268814, + "timestamp": "2025-09-04 04:03:51.903987", + "step": 2070, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:03:52.014076", + "step": 2070, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006645853631198406, + "timestamp": "2025-09-04 04:03:52.033063", + "step": 2071, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 432 + ], + "flops": 8640052517568.0 + }, + "timestamp": "2025-09-04 04:03:52.104148", + "step": 2071, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02741372399032116, + "timestamp": "2025-09-04 04:03:52.117669", + "step": 2072, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:03:52.210557", + "step": 2072, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0004673259099945426, + "timestamp": "2025-09-04 04:03:52.229709", + "step": 2073, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:03:52.335470", + "step": 2073, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023968873545527458, + "timestamp": "2025-09-04 04:03:52.355570", + "step": 2074, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 944 + ], + "flops": 18880114680512.0 + }, + "timestamp": "2025-09-04 04:03:52.490754", + "step": 2074, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011759743094444275, + "timestamp": "2025-09-04 04:03:52.517067", + "step": 2075, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:03:52.629680", + "step": 2075, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002521298360079527, + "timestamp": "2025-09-04 04:03:52.651213", + "step": 2076, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:03:52.733607", + "step": 2076, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.019572464749217033, + "timestamp": "2025-09-04 04:03:52.750683", + "step": 2077, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:03:52.840487", + "step": 2077, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002779679372906685, + "timestamp": "2025-09-04 04:03:52.857245", + "step": 2078, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:03:52.957934", + "step": 2078, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.026165010407567024, + "timestamp": "2025-09-04 04:03:52.976685", + "step": 2079, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:03:53.076614", + "step": 2079, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004176696762442589, + "timestamp": "2025-09-04 04:03:53.096246", + "step": 2080, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:04:01.472371", + "step": 2080, + "epoch": 2 + }, + { + "type": "pplx", + "content": 320.5430643577628, + "timestamp": "2025-09-04 04:04:01.474232", + "step": 2080, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2080", + "timestamp": "2025-09-04 04:04:01.816278", + "step": 2080, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:04:01.899311", + "step": 2080, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0009140381007455289, + "timestamp": "2025-09-04 04:04:01.916437", + "step": 2081, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:04:02.017975", + "step": 2081, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0035500312224030495, + "timestamp": "2025-09-04 04:04:02.036771", + "step": 2082, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:04:02.123421", + "step": 2082, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.06194588169455528, + "timestamp": "2025-09-04 04:04:02.139060", + "step": 2083, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:04:02.234678", + "step": 2083, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.08327718824148178, + "timestamp": "2025-09-04 04:04:02.252937", + "step": 2084, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:04:02.367288", + "step": 2084, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0026233464013785124, + "timestamp": "2025-09-04 04:04:02.391497", + "step": 2085, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:04:02.491052", + "step": 2085, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.13432270288467407, + "timestamp": "2025-09-04 04:04:02.509568", + "step": 2086, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:04:02.595298", + "step": 2086, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.026126880198717117, + "timestamp": "2025-09-04 04:04:02.610922", + "step": 2087, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 816 + ], + "flops": 16320099139776.0 + }, + "timestamp": "2025-09-04 04:04:02.732732", + "step": 2087, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.001551034045405686, + "timestamp": "2025-09-04 04:04:02.756661", + "step": 2088, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:04:02.857941", + "step": 2088, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002250077435746789, + "timestamp": "2025-09-04 04:04:02.879175", + "step": 2089, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:04:02.980040", + "step": 2089, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03148679807782173, + "timestamp": "2025-09-04 04:04:02.998944", + "step": 2090, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:04:03.093272", + "step": 2090, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01184455119073391, + "timestamp": "2025-09-04 04:04:03.110673", + "step": 2091, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:04:03.211623", + "step": 2091, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0011833092430606484, + "timestamp": "2025-09-04 04:04:03.231212", + "step": 2092, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:04:03.324401", + "step": 2092, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003595761489123106, + "timestamp": "2025-09-04 04:04:03.343596", + "step": 2093, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:04:03.437989", + "step": 2093, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.027849143370985985, + "timestamp": "2025-09-04 04:04:03.455117", + "step": 2094, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:04:03.556386", + "step": 2094, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022928999736905098, + "timestamp": "2025-09-04 04:04:03.575190", + "step": 2095, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:04:03.668150", + "step": 2095, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.07359001040458679, + "timestamp": "2025-09-04 04:04:03.686058", + "step": 2096, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:04:03.789756", + "step": 2096, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05725327506661415, + "timestamp": "2025-09-04 04:04:03.811698", + "step": 2097, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:04:03.915279", + "step": 2097, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04447811469435692, + "timestamp": "2025-09-04 04:04:03.934528", + "step": 2098, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:04:04.039378", + "step": 2098, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0025624537374824286, + "timestamp": "2025-09-04 04:04:04.057979", + "step": 2099, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:04:04.158399", + "step": 2099, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0035990336909890175, + "timestamp": "2025-09-04 04:04:04.178016", + "step": 2100, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:04:12.532858", + "step": 2100, + "epoch": 2 + }, + { + "type": "pplx", + "content": 314.23096938736927, + "timestamp": "2025-09-04 04:04:12.534871", + "step": 2100, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:04:12.620879", + "step": 2100, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003338422393426299, + "timestamp": "2025-09-04 04:04:12.639036", + "step": 2101, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:04:12.731933", + "step": 2101, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002954112831503153, + "timestamp": "2025-09-04 04:04:12.749327", + "step": 2102, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:04:12.850749", + "step": 2102, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009453012607991695, + "timestamp": "2025-09-04 04:04:12.869799", + "step": 2103, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 944 + ], + "flops": 18880114680512.0 + }, + "timestamp": "2025-09-04 04:04:13.007718", + "step": 2103, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020522311329841614, + "timestamp": "2025-09-04 04:04:13.034524", + "step": 2104, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:04:13.121973", + "step": 2104, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024143625050783157, + "timestamp": "2025-09-04 04:04:13.140329", + "step": 2105, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:04:13.232648", + "step": 2105, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00796930119395256, + "timestamp": "2025-09-04 04:04:13.249377", + "step": 2106, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:04:13.351215", + "step": 2106, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.000632771581877023, + "timestamp": "2025-09-04 04:04:13.370206", + "step": 2107, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:04:13.460383", + "step": 2107, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002220430178567767, + "timestamp": "2025-09-04 04:04:13.478003", + "step": 2108, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:04:13.583120", + "step": 2108, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01635553501546383, + "timestamp": "2025-09-04 04:04:13.604209", + "step": 2109, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:04:13.710252", + "step": 2109, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017017148435115814, + "timestamp": "2025-09-04 04:04:13.729384", + "step": 2110, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:04:13.806648", + "step": 2110, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0019480792107060552, + "timestamp": "2025-09-04 04:04:13.820438", + "step": 2111, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:04:13.929794", + "step": 2111, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012161768041551113, + "timestamp": "2025-09-04 04:04:13.950904", + "step": 2112, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:04:14.049070", + "step": 2112, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.048178572207689285, + "timestamp": "2025-09-04 04:04:14.069251", + "step": 2113, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:04:14.173919", + "step": 2113, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003165569854900241, + "timestamp": "2025-09-04 04:04:14.192987", + "step": 2114, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:04:14.297053", + "step": 2114, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015800610184669495, + "timestamp": "2025-09-04 04:04:14.316020", + "step": 2115, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:04:14.416443", + "step": 2115, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.048695262521505356, + "timestamp": "2025-09-04 04:04:14.435860", + "step": 2116, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:04:14.535495", + "step": 2116, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002211391692981124, + "timestamp": "2025-09-04 04:04:14.556550", + "step": 2117, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:04:14.666573", + "step": 2117, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006672736257314682, + "timestamp": "2025-09-04 04:04:14.687028", + "step": 2118, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:04:14.803052", + "step": 2118, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010592850856482983, + "timestamp": "2025-09-04 04:04:14.825098", + "step": 2119, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:04:14.932592", + "step": 2119, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020320625975728035, + "timestamp": "2025-09-04 04:04:14.953122", + "step": 2120, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:04:23.313391", + "step": 2120, + "epoch": 2 + }, + { + "type": "pplx", + "content": 312.7804928893332, + "timestamp": "2025-09-04 04:04:23.315624", + "step": 2120, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2120", + "timestamp": "2025-09-04 04:04:23.681997", + "step": 2120, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:04:23.781623", + "step": 2120, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010732216760516167, + "timestamp": "2025-09-04 04:04:23.802720", + "step": 2121, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:04:23.889401", + "step": 2121, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005511862691491842, + "timestamp": "2025-09-04 04:04:23.905182", + "step": 2122, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:04:23.997677", + "step": 2122, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0008789485436864197, + "timestamp": "2025-09-04 04:04:24.014839", + "step": 2123, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1248 + ], + "flops": 24960151589760.0 + }, + "timestamp": "2025-09-04 04:04:24.199203", + "step": 2123, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018326403573155403, + "timestamp": "2025-09-04 04:04:24.234414", + "step": 2124, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:04:24.324777", + "step": 2124, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012555736117064953, + "timestamp": "2025-09-04 04:04:24.343404", + "step": 2125, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:04:24.453436", + "step": 2125, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0006459427531808615, + "timestamp": "2025-09-04 04:04:24.473852", + "step": 2126, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:04:24.575742", + "step": 2126, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0454648919403553, + "timestamp": "2025-09-04 04:04:24.594874", + "step": 2127, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:04:24.680520", + "step": 2127, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.10542017221450806, + "timestamp": "2025-09-04 04:04:24.696351", + "step": 2128, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:04:24.782005", + "step": 2128, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014704999513924122, + "timestamp": "2025-09-04 04:04:24.799271", + "step": 2129, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:04:24.889998", + "step": 2129, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0018945076735690236, + "timestamp": "2025-09-04 04:04:24.906873", + "step": 2130, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:04:25.005752", + "step": 2130, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03164684399962425, + "timestamp": "2025-09-04 04:04:25.024319", + "step": 2131, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:04:25.122975", + "step": 2131, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0005463764537125826, + "timestamp": "2025-09-04 04:04:25.142239", + "step": 2132, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1248 + ], + "flops": 24960151589760.0 + }, + "timestamp": "2025-09-04 04:04:25.323234", + "step": 2132, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008721102960407734, + "timestamp": "2025-09-04 04:04:25.361001", + "step": 2133, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:04:25.459833", + "step": 2133, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008502230979502201, + "timestamp": "2025-09-04 04:04:25.478374", + "step": 2134, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:04:25.555190", + "step": 2134, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.06675466895103455, + "timestamp": "2025-09-04 04:04:25.569241", + "step": 2135, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:04:25.670339", + "step": 2135, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006031819619238377, + "timestamp": "2025-09-04 04:04:25.689924", + "step": 2136, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:04:25.788530", + "step": 2136, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005401272792369127, + "timestamp": "2025-09-04 04:04:25.809427", + "step": 2137, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:04:25.904126", + "step": 2137, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008368422277271748, + "timestamp": "2025-09-04 04:04:25.921786", + "step": 2138, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:04:26.024282", + "step": 2138, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004673839081078768, + "timestamp": "2025-09-04 04:04:26.043265", + "step": 2139, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:04:26.153282", + "step": 2139, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03570368513464928, + "timestamp": "2025-09-04 04:04:26.174401", + "step": 2140, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:04:34.544277", + "step": 2140, + "epoch": 2 + }, + { + "type": "pplx", + "content": 310.4419842128476, + "timestamp": "2025-09-04 04:04:34.546820", + "step": 2140, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:04:34.636749", + "step": 2140, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0012499855365604162, + "timestamp": "2025-09-04 04:04:34.655620", + "step": 2141, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:04:34.731528", + "step": 2141, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006291459314525127, + "timestamp": "2025-09-04 04:04:34.745151", + "step": 2142, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:04:34.853013", + "step": 2142, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0022742494475096464, + "timestamp": "2025-09-04 04:04:34.873107", + "step": 2143, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:04:34.984340", + "step": 2143, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01345337089151144, + "timestamp": "2025-09-04 04:04:35.005729", + "step": 2144, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:04:35.094070", + "step": 2144, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03179781138896942, + "timestamp": "2025-09-04 04:04:35.112411", + "step": 2145, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:04:35.212528", + "step": 2145, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004378853365778923, + "timestamp": "2025-09-04 04:04:35.231121", + "step": 2146, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1408 + ], + "flops": 28160171015680.0 + }, + "timestamp": "2025-09-04 04:04:35.435817", + "step": 2146, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0009116848814301193, + "timestamp": "2025-09-04 04:04:35.475134", + "step": 2147, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:04:35.578468", + "step": 2147, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007793641183525324, + "timestamp": "2025-09-04 04:04:35.598367", + "step": 2148, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:04:35.705536", + "step": 2148, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009497767314314842, + "timestamp": "2025-09-04 04:04:35.728150", + "step": 2149, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:04:35.812631", + "step": 2149, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.038677141070365906, + "timestamp": "2025-09-04 04:04:35.827760", + "step": 2150, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:04:35.910668", + "step": 2150, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006493302993476391, + "timestamp": "2025-09-04 04:04:35.925541", + "step": 2151, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:04:36.021059", + "step": 2151, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006713113281875849, + "timestamp": "2025-09-04 04:04:36.039260", + "step": 2152, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:04:36.144671", + "step": 2152, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05974787473678589, + "timestamp": "2025-09-04 04:04:36.165757", + "step": 2153, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:04:36.257893", + "step": 2153, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005305037368088961, + "timestamp": "2025-09-04 04:04:36.274415", + "step": 2154, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:04:36.375842", + "step": 2154, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018993912264704704, + "timestamp": "2025-09-04 04:04:36.394459", + "step": 2155, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:04:36.471725", + "step": 2155, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02348383143544197, + "timestamp": "2025-09-04 04:04:36.486467", + "step": 2156, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:04:36.578110", + "step": 2156, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020909776911139488, + "timestamp": "2025-09-04 04:04:36.596903", + "step": 2157, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:04:36.687043", + "step": 2157, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01592099852859974, + "timestamp": "2025-09-04 04:04:36.703800", + "step": 2158, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:04:36.813038", + "step": 2158, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.045487433671951294, + "timestamp": "2025-09-04 04:04:36.833248", + "step": 2159, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:04:36.937059", + "step": 2159, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02561834827065468, + "timestamp": "2025-09-04 04:04:36.956799", + "step": 2160, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:04:45.360928", + "step": 2160, + "epoch": 2 + }, + { + "type": "pplx", + "content": 308.36794591931954, + "timestamp": "2025-09-04 04:04:45.362993", + "step": 2160, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2160", + "timestamp": "2025-09-04 04:04:45.876044", + "step": 2160, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:04:45.978684", + "step": 2160, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01050900761038065, + "timestamp": "2025-09-04 04:04:46.000386", + "step": 2161, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:04:46.105099", + "step": 2161, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0030154536943882704, + "timestamp": "2025-09-04 04:04:46.124404", + "step": 2162, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:04:46.227803", + "step": 2162, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.019422519952058792, + "timestamp": "2025-09-04 04:04:46.247095", + "step": 2163, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:04:46.352915", + "step": 2163, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05433286726474762, + "timestamp": "2025-09-04 04:04:46.373652", + "step": 2164, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:04:46.464426", + "step": 2164, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020474649965763092, + "timestamp": "2025-09-04 04:04:46.483111", + "step": 2165, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:04:46.584033", + "step": 2165, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004737554118037224, + "timestamp": "2025-09-04 04:04:46.602973", + "step": 2166, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 896 + ], + "flops": 17920108852736.0 + }, + "timestamp": "2025-09-04 04:04:46.733689", + "step": 2166, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0017673440743237734, + "timestamp": "2025-09-04 04:04:46.758348", + "step": 2167, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:04:46.859429", + "step": 2167, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.052274227142333984, + "timestamp": "2025-09-04 04:04:46.879084", + "step": 2168, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:04:46.977326", + "step": 2168, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004377824254333973, + "timestamp": "2025-09-04 04:04:46.997720", + "step": 2169, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:04:47.103598", + "step": 2169, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04207998141646385, + "timestamp": "2025-09-04 04:04:47.123611", + "step": 2170, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:04:47.225813", + "step": 2170, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007436053827404976, + "timestamp": "2025-09-04 04:04:47.243190", + "step": 2171, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:04:47.337262", + "step": 2171, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007250096648931503, + "timestamp": "2025-09-04 04:04:47.353490", + "step": 2172, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:04:47.434923", + "step": 2172, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0015918496064841747, + "timestamp": "2025-09-04 04:04:47.451549", + "step": 2173, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:04:47.556307", + "step": 2173, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01956302858889103, + "timestamp": "2025-09-04 04:04:47.575618", + "step": 2174, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:04:47.681540", + "step": 2174, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04470936954021454, + "timestamp": "2025-09-04 04:04:47.701563", + "step": 2175, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:04:47.802526", + "step": 2175, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03270625323057175, + "timestamp": "2025-09-04 04:04:47.822128", + "step": 2176, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:04:47.930127", + "step": 2176, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0014341897331178188, + "timestamp": "2025-09-04 04:04:47.952666", + "step": 2177, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:04:48.053251", + "step": 2177, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.019161755219101906, + "timestamp": "2025-09-04 04:04:48.071882", + "step": 2178, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:04:48.181349", + "step": 2178, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0018422268331050873, + "timestamp": "2025-09-04 04:04:48.201898", + "step": 2179, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:04:48.295085", + "step": 2179, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017137227579951286, + "timestamp": "2025-09-04 04:04:48.312984", + "step": 2180, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:04:56.709103", + "step": 2180, + "epoch": 2 + }, + { + "type": "pplx", + "content": 308.3335132373899, + "timestamp": "2025-09-04 04:04:56.710918", + "step": 2180, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:04:56.799844", + "step": 2180, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013462487608194351, + "timestamp": "2025-09-04 04:04:56.818604", + "step": 2181, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:04:56.915894", + "step": 2181, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015639422461390495, + "timestamp": "2025-09-04 04:04:56.933526", + "step": 2182, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:04:57.027042", + "step": 2182, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008944478817284107, + "timestamp": "2025-09-04 04:04:57.044188", + "step": 2183, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:04:57.147968", + "step": 2183, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.037470266222953796, + "timestamp": "2025-09-04 04:04:57.168011", + "step": 2184, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:04:57.266544", + "step": 2184, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04356255754828453, + "timestamp": "2025-09-04 04:04:57.287325", + "step": 2185, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:04:57.380483", + "step": 2185, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012546907179057598, + "timestamp": "2025-09-04 04:04:57.397427", + "step": 2186, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:04:57.504540", + "step": 2186, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0026249419897794724, + "timestamp": "2025-09-04 04:04:57.524297", + "step": 2187, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:04:57.628114", + "step": 2187, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007875807583332062, + "timestamp": "2025-09-04 04:04:57.647908", + "step": 2188, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:04:57.755912", + "step": 2188, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020884279161691666, + "timestamp": "2025-09-04 04:04:57.778278", + "step": 2189, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:04:57.881872", + "step": 2189, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010031616315245628, + "timestamp": "2025-09-04 04:04:57.901024", + "step": 2190, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:04:57.984919", + "step": 2190, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01399738434702158, + "timestamp": "2025-09-04 04:04:58.000170", + "step": 2191, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:04:58.099531", + "step": 2191, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008733275346457958, + "timestamp": "2025-09-04 04:04:58.118882", + "step": 2192, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:04:58.221416", + "step": 2192, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03269082307815552, + "timestamp": "2025-09-04 04:04:58.241771", + "step": 2193, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:04:58.343688", + "step": 2193, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03407390043139458, + "timestamp": "2025-09-04 04:04:58.362657", + "step": 2194, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:04:58.462986", + "step": 2194, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016373533755540848, + "timestamp": "2025-09-04 04:04:58.481948", + "step": 2195, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:04:58.582160", + "step": 2195, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002695757895708084, + "timestamp": "2025-09-04 04:04:58.601793", + "step": 2196, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:04:58.702197", + "step": 2196, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03435313701629639, + "timestamp": "2025-09-04 04:04:58.723146", + "step": 2197, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:04:58.829250", + "step": 2197, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011219386011362076, + "timestamp": "2025-09-04 04:04:58.849348", + "step": 2198, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:04:58.940281", + "step": 2198, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018078068271279335, + "timestamp": "2025-09-04 04:04:58.957188", + "step": 2199, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:04:59.062258", + "step": 2199, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.028233405202627182, + "timestamp": "2025-09-04 04:04:59.082408", + "step": 2200, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:05:07.470250", + "step": 2200, + "epoch": 2 + }, + { + "type": "pplx", + "content": 312.6614110155863, + "timestamp": "2025-09-04 04:05:07.473143", + "step": 2200, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2200", + "timestamp": "2025-09-04 04:05:07.823843", + "step": 2200, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:05:07.923299", + "step": 2200, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016595905646681786, + "timestamp": "2025-09-04 04:05:07.944190", + "step": 2201, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:05:08.039605", + "step": 2201, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016727754846215248, + "timestamp": "2025-09-04 04:05:08.056980", + "step": 2202, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:05:08.149959", + "step": 2202, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008159826509654522, + "timestamp": "2025-09-04 04:05:08.166773", + "step": 2203, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:05:08.254409", + "step": 2203, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016688521951436996, + "timestamp": "2025-09-04 04:05:08.270697", + "step": 2204, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:05:08.363758", + "step": 2204, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003086843527853489, + "timestamp": "2025-09-04 04:05:08.382943", + "step": 2205, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:05:08.487266", + "step": 2205, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003638209542259574, + "timestamp": "2025-09-04 04:05:08.506389", + "step": 2206, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:05:08.600858", + "step": 2206, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01052644569426775, + "timestamp": "2025-09-04 04:05:08.618092", + "step": 2207, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:05:08.722308", + "step": 2207, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022792614996433258, + "timestamp": "2025-09-04 04:05:08.742369", + "step": 2208, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:05:08.846839", + "step": 2208, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002457141410559416, + "timestamp": "2025-09-04 04:05:08.868936", + "step": 2209, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:05:08.959620", + "step": 2209, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.06071026250720024, + "timestamp": "2025-09-04 04:05:08.976336", + "step": 2210, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:05:09.082718", + "step": 2210, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03304049372673035, + "timestamp": "2025-09-04 04:05:09.101999", + "step": 2211, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:05:09.205527", + "step": 2211, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00047334007103927433, + "timestamp": "2025-09-04 04:05:09.225143", + "step": 2212, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:05:09.316744", + "step": 2212, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03290455788373947, + "timestamp": "2025-09-04 04:05:09.335668", + "step": 2213, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:05:09.430257", + "step": 2213, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007393876556307077, + "timestamp": "2025-09-04 04:05:09.447602", + "step": 2214, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:05:09.551318", + "step": 2214, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012073284946382046, + "timestamp": "2025-09-04 04:05:09.570585", + "step": 2215, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:05:09.662952", + "step": 2215, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.07519317418336868, + "timestamp": "2025-09-04 04:05:09.680572", + "step": 2216, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:05:09.781402", + "step": 2216, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04182714223861694, + "timestamp": "2025-09-04 04:05:09.801878", + "step": 2217, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:05:09.907420", + "step": 2217, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0027501049917191267, + "timestamp": "2025-09-04 04:05:09.926426", + "step": 2218, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 04:05:10.062445", + "step": 2218, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007170806173235178, + "timestamp": "2025-09-04 04:05:10.088381", + "step": 2219, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:05:10.184360", + "step": 2219, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01622505858540535, + "timestamp": "2025-09-04 04:05:10.202528", + "step": 2220, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:05:18.580414", + "step": 2220, + "epoch": 2 + }, + { + "type": "pplx", + "content": 317.98368699126075, + "timestamp": "2025-09-04 04:05:18.582752", + "step": 2220, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:05:18.682258", + "step": 2220, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0025658165104687214, + "timestamp": "2025-09-04 04:05:18.703606", + "step": 2221, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:05:18.778872", + "step": 2221, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013223507441580296, + "timestamp": "2025-09-04 04:05:18.792344", + "step": 2222, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:05:18.885172", + "step": 2222, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010641835629940033, + "timestamp": "2025-09-04 04:05:18.902241", + "step": 2223, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 04:05:19.036671", + "step": 2223, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011351378634572029, + "timestamp": "2025-09-04 04:05:19.063427", + "step": 2224, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:05:19.164795", + "step": 2224, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00913854967802763, + "timestamp": "2025-09-04 04:05:19.186006", + "step": 2225, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:05:19.283788", + "step": 2225, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.034181058406829834, + "timestamp": "2025-09-04 04:05:19.301287", + "step": 2226, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:05:19.392275", + "step": 2226, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0016610038001090288, + "timestamp": "2025-09-04 04:05:19.409064", + "step": 2227, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:05:19.485169", + "step": 2227, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02116265520453453, + "timestamp": "2025-09-04 04:05:19.499668", + "step": 2228, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:05:19.601431", + "step": 2228, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006079908460378647, + "timestamp": "2025-09-04 04:05:19.621854", + "step": 2229, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:05:19.731909", + "step": 2229, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0019000859465450048, + "timestamp": "2025-09-04 04:05:19.752516", + "step": 2230, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:05:19.847153", + "step": 2230, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03679632768034935, + "timestamp": "2025-09-04 04:05:19.864479", + "step": 2231, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 04:05:19.986974", + "step": 2231, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004042410757392645, + "timestamp": "2025-09-04 04:05:20.011031", + "step": 2232, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:05:20.116757", + "step": 2232, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011536908335983753, + "timestamp": "2025-09-04 04:05:20.137776", + "step": 2233, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1168 + ], + "flops": 23360141876800.0 + }, + "timestamp": "2025-09-04 04:05:20.313896", + "step": 2233, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0038091272581368685, + "timestamp": "2025-09-04 04:05:20.346627", + "step": 2234, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:05:20.452628", + "step": 2234, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002943043364211917, + "timestamp": "2025-09-04 04:05:20.472632", + "step": 2235, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:05:20.579930", + "step": 2235, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009634853340685368, + "timestamp": "2025-09-04 04:05:20.600869", + "step": 2236, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:05:20.690783", + "step": 2236, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022745907306671143, + "timestamp": "2025-09-04 04:05:20.709227", + "step": 2237, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:05:20.818333", + "step": 2237, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003655359148979187, + "timestamp": "2025-09-04 04:05:20.838412", + "step": 2238, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:05:20.930756", + "step": 2238, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016437901183962822, + "timestamp": "2025-09-04 04:05:20.946338", + "step": 2239, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:05:21.053890", + "step": 2239, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01712539792060852, + "timestamp": "2025-09-04 04:05:21.074793", + "step": 2240, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:05:29.475438", + "step": 2240, + "epoch": 2 + }, + { + "type": "pplx", + "content": 320.6073536444029, + "timestamp": "2025-09-04 04:05:29.477692", + "step": 2240, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2240", + "timestamp": "2025-09-04 04:05:29.965605", + "step": 2240, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:05:30.067717", + "step": 2240, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003082787152379751, + "timestamp": "2025-09-04 04:05:30.089527", + "step": 2241, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1376 + ], + "flops": 27520167130496.0 + }, + "timestamp": "2025-09-04 04:05:30.294473", + "step": 2241, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.06166600435972214, + "timestamp": "2025-09-04 04:05:30.333615", + "step": 2242, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:05:30.412898", + "step": 2242, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007548161782324314, + "timestamp": "2025-09-04 04:05:30.427094", + "step": 2243, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:05:30.544314", + "step": 2243, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.026522303000092506, + "timestamp": "2025-09-04 04:05:30.567160", + "step": 2244, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:05:30.675609", + "step": 2244, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04962928593158722, + "timestamp": "2025-09-04 04:05:30.697859", + "step": 2245, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:05:30.789135", + "step": 2245, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.001753210905008018, + "timestamp": "2025-09-04 04:05:30.805937", + "step": 2246, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:05:30.891711", + "step": 2246, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0030653884168714285, + "timestamp": "2025-09-04 04:05:30.907210", + "step": 2247, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:05:31.010986", + "step": 2247, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04342466592788696, + "timestamp": "2025-09-04 04:05:31.030783", + "step": 2248, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:05:31.123299", + "step": 2248, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02544695883989334, + "timestamp": "2025-09-04 04:05:31.142550", + "step": 2249, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1184 + ], + "flops": 23680143819392.0 + }, + "timestamp": "2025-09-04 04:05:31.317706", + "step": 2249, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005824543070048094, + "timestamp": "2025-09-04 04:05:31.352320", + "step": 2250, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:05:31.447443", + "step": 2250, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004376427736133337, + "timestamp": "2025-09-04 04:05:31.464644", + "step": 2251, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:05:31.548999", + "step": 2251, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03957490622997284, + "timestamp": "2025-09-04 04:05:31.564806", + "step": 2252, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:05:31.649806", + "step": 2252, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006183784920722246, + "timestamp": "2025-09-04 04:05:31.666917", + "step": 2253, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:05:31.745329", + "step": 2253, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004066129215061665, + "timestamp": "2025-09-04 04:05:31.759132", + "step": 2254, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:05:31.863127", + "step": 2254, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009590189903974533, + "timestamp": "2025-09-04 04:05:31.882110", + "step": 2255, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:05:31.979430", + "step": 2255, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015844259411096573, + "timestamp": "2025-09-04 04:05:31.997578", + "step": 2256, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:05:32.104647", + "step": 2256, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007126522250473499, + "timestamp": "2025-09-04 04:05:32.126836", + "step": 2257, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1376 + ], + "flops": 27520167130496.0 + }, + "timestamp": "2025-09-04 04:05:32.331721", + "step": 2257, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0030276242177933455, + "timestamp": "2025-09-04 04:05:32.370738", + "step": 2258, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:05:32.476655", + "step": 2258, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01780683733522892, + "timestamp": "2025-09-04 04:05:32.495823", + "step": 2259, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:05:32.596864", + "step": 2259, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0012856582179665565, + "timestamp": "2025-09-04 04:05:32.616120", + "step": 2260, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:05:41.117371", + "step": 2260, + "epoch": 2 + }, + { + "type": "pplx", + "content": 319.322025056002, + "timestamp": "2025-09-04 04:05:41.119565", + "step": 2260, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 800 + ], + "flops": 16000097197184.0 + }, + "timestamp": "2025-09-04 04:05:41.234127", + "step": 2260, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.10266165435314178, + "timestamp": "2025-09-04 04:05:41.257967", + "step": 2261, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:05:41.361408", + "step": 2261, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013549219816923141, + "timestamp": "2025-09-04 04:05:41.380593", + "step": 2262, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:05:41.482941", + "step": 2262, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0038013458251953125, + "timestamp": "2025-09-04 04:05:41.501959", + "step": 2263, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:05:41.596910", + "step": 2263, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.06860756129026413, + "timestamp": "2025-09-04 04:05:41.615157", + "step": 2264, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:05:41.707053", + "step": 2264, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04129083827137947, + "timestamp": "2025-09-04 04:05:41.725972", + "step": 2265, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:05:41.826374", + "step": 2265, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.037230126559734344, + "timestamp": "2025-09-04 04:05:41.845042", + "step": 2266, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:05:41.947523", + "step": 2266, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009212334640324116, + "timestamp": "2025-09-04 04:05:41.966655", + "step": 2267, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:05:42.071971", + "step": 2267, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013861672952771187, + "timestamp": "2025-09-04 04:05:42.091888", + "step": 2268, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:05:42.166045", + "step": 2268, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007805598899722099, + "timestamp": "2025-09-04 04:05:42.180729", + "step": 2269, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:05:42.285975", + "step": 2269, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0023477617651224136, + "timestamp": "2025-09-04 04:05:42.305107", + "step": 2270, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:05:42.405410", + "step": 2270, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0031649265438318253, + "timestamp": "2025-09-04 04:05:42.424225", + "step": 2271, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:05:42.524728", + "step": 2271, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018990959972143173, + "timestamp": "2025-09-04 04:05:42.544057", + "step": 2272, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:05:42.632585", + "step": 2272, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011629465036094189, + "timestamp": "2025-09-04 04:05:42.650953", + "step": 2273, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:05:42.729086", + "step": 2273, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0017159185372292995, + "timestamp": "2025-09-04 04:05:42.743117", + "step": 2274, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1472 + ], + "flops": 29440178786048.0 + }, + "timestamp": "2025-09-04 04:05:42.957175", + "step": 2274, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012947708368301392, + "timestamp": "2025-09-04 04:05:42.997905", + "step": 2275, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:05:43.092714", + "step": 2275, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03470579907298088, + "timestamp": "2025-09-04 04:05:43.110971", + "step": 2276, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:05:43.187661", + "step": 2276, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005358322989195585, + "timestamp": "2025-09-04 04:05:43.202956", + "step": 2277, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:05:43.306397", + "step": 2277, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008757129311561584, + "timestamp": "2025-09-04 04:05:43.325338", + "step": 2278, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 864 + ], + "flops": 17280104967552.0 + }, + "timestamp": "2025-09-04 04:05:43.453113", + "step": 2278, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.001655671396292746, + "timestamp": "2025-09-04 04:05:43.477203", + "step": 2279, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:05:43.581200", + "step": 2279, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003800937905907631, + "timestamp": "2025-09-04 04:05:43.600971", + "step": 2280, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:05:51.995079", + "step": 2280, + "epoch": 2 + }, + { + "type": "pplx", + "content": 310.31223764687115, + "timestamp": "2025-09-04 04:05:51.997642", + "step": 2280, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2280", + "timestamp": "2025-09-04 04:05:52.345100", + "step": 2280, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:05:52.420573", + "step": 2280, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0019430589163675904, + "timestamp": "2025-09-04 04:05:52.435958", + "step": 2281, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:05:52.540294", + "step": 2281, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.035052590072155, + "timestamp": "2025-09-04 04:05:52.559389", + "step": 2282, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:05:52.661540", + "step": 2282, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.026764625683426857, + "timestamp": "2025-09-04 04:05:52.680240", + "step": 2283, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:05:52.759445", + "step": 2283, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010539744980633259, + "timestamp": "2025-09-04 04:05:52.774253", + "step": 2284, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:05:52.872872", + "step": 2284, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03832215815782547, + "timestamp": "2025-09-04 04:05:52.893433", + "step": 2285, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:05:53.002613", + "step": 2285, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005969279911369085, + "timestamp": "2025-09-04 04:05:53.022737", + "step": 2286, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:05:53.115750", + "step": 2286, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011899287812411785, + "timestamp": "2025-09-04 04:05:53.132397", + "step": 2287, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:05:53.220053", + "step": 2287, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01706940494477749, + "timestamp": "2025-09-04 04:05:53.236433", + "step": 2288, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:05:53.325483", + "step": 2288, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010303936898708344, + "timestamp": "2025-09-04 04:05:53.343787", + "step": 2289, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:05:53.420130", + "step": 2289, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0042968811467289925, + "timestamp": "2025-09-04 04:05:53.433846", + "step": 2290, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:05:53.527246", + "step": 2290, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.11020837724208832, + "timestamp": "2025-09-04 04:05:53.544270", + "step": 2291, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:05:53.646159", + "step": 2291, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.2083846777677536, + "timestamp": "2025-09-04 04:05:53.665376", + "step": 2292, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:05:53.766145", + "step": 2292, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007627859245985746, + "timestamp": "2025-09-04 04:05:53.787078", + "step": 2293, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:05:53.885519", + "step": 2293, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02781866304576397, + "timestamp": "2025-09-04 04:05:53.902604", + "step": 2294, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:05:54.004580", + "step": 2294, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0024229728151112795, + "timestamp": "2025-09-04 04:05:54.023213", + "step": 2295, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:05:54.131939", + "step": 2295, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005142164416611195, + "timestamp": "2025-09-04 04:05:54.152617", + "step": 2296, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:05:54.252044", + "step": 2296, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004444428253918886, + "timestamp": "2025-09-04 04:05:54.272465", + "step": 2297, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:05:54.366361", + "step": 2297, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008370397612452507, + "timestamp": "2025-09-04 04:05:54.383416", + "step": 2298, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:05:54.484682", + "step": 2298, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020550237968564034, + "timestamp": "2025-09-04 04:05:54.503493", + "step": 2299, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:05:54.614990", + "step": 2299, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.027608707547187805, + "timestamp": "2025-09-04 04:05:54.635981", + "step": 2300, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:06:03.087262", + "step": 2300, + "epoch": 2 + }, + { + "type": "pplx", + "content": 302.3328329629629, + "timestamp": "2025-09-04 04:06:03.089701", + "step": 2300, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:06:03.163146", + "step": 2300, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01112450659275055, + "timestamp": "2025-09-04 04:06:03.177907", + "step": 2301, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:06:03.280100", + "step": 2301, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007555335760116577, + "timestamp": "2025-09-04 04:06:03.298865", + "step": 2302, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:06:03.384788", + "step": 2302, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024827582761645317, + "timestamp": "2025-09-04 04:06:03.400041", + "step": 2303, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:06:03.493523", + "step": 2303, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006991108413785696, + "timestamp": "2025-09-04 04:06:03.511385", + "step": 2304, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:06:03.603632", + "step": 2304, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0047009047120809555, + "timestamp": "2025-09-04 04:06:03.622610", + "step": 2305, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:06:03.714249", + "step": 2305, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021796375513076782, + "timestamp": "2025-09-04 04:06:03.730911", + "step": 2306, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:06:03.838853", + "step": 2306, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01497070025652647, + "timestamp": "2025-09-04 04:06:03.857863", + "step": 2307, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:06:03.967870", + "step": 2307, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010811883956193924, + "timestamp": "2025-09-04 04:06:03.989029", + "step": 2308, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:06:04.086437", + "step": 2308, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007137446664273739, + "timestamp": "2025-09-04 04:06:04.105083", + "step": 2309, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:06:04.208371", + "step": 2309, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00834791362285614, + "timestamp": "2025-09-04 04:06:04.227423", + "step": 2310, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:06:04.332171", + "step": 2310, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008670263923704624, + "timestamp": "2025-09-04 04:06:04.351382", + "step": 2311, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:06:04.458108", + "step": 2311, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009029252454638481, + "timestamp": "2025-09-04 04:06:04.476256", + "step": 2312, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:06:04.576767", + "step": 2312, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004705758765339851, + "timestamp": "2025-09-04 04:06:04.597508", + "step": 2313, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:06:04.707664", + "step": 2313, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0046806796453893185, + "timestamp": "2025-09-04 04:06:04.727925", + "step": 2314, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:06:04.831143", + "step": 2314, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03220272809267044, + "timestamp": "2025-09-04 04:06:04.850065", + "step": 2315, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:06:04.935355", + "step": 2315, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013215369544923306, + "timestamp": "2025-09-04 04:06:04.951839", + "step": 2316, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:06:05.033934", + "step": 2316, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03970480337738991, + "timestamp": "2025-09-04 04:06:05.050832", + "step": 2317, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:06:05.140919", + "step": 2317, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0069844783283770084, + "timestamp": "2025-09-04 04:06:05.157670", + "step": 2318, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:06:05.257832", + "step": 2318, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006692732684314251, + "timestamp": "2025-09-04 04:06:05.276581", + "step": 2319, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:06:05.380384", + "step": 2319, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02646188624203205, + "timestamp": "2025-09-04 04:06:05.400003", + "step": 2320, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:06:13.772992", + "step": 2320, + "epoch": 2 + }, + { + "type": "pplx", + "content": 295.6536796467591, + "timestamp": "2025-09-04 04:06:13.775137", + "step": 2320, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2320", + "timestamp": "2025-09-04 04:06:14.126831", + "step": 2320, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:06:14.207180", + "step": 2320, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009197513572871685, + "timestamp": "2025-09-04 04:06:14.223926", + "step": 2321, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:06:14.332576", + "step": 2321, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.08209947496652603, + "timestamp": "2025-09-04 04:06:14.352709", + "step": 2322, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 944 + ], + "flops": 18880114680512.0 + }, + "timestamp": "2025-09-04 04:06:14.489174", + "step": 2322, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024091873317956924, + "timestamp": "2025-09-04 04:06:14.515467", + "step": 2323, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:06:14.619946", + "step": 2323, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012969830073416233, + "timestamp": "2025-09-04 04:06:14.639869", + "step": 2324, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:06:14.742599", + "step": 2324, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.039127450436353683, + "timestamp": "2025-09-04 04:06:14.764493", + "step": 2325, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:06:14.868050", + "step": 2325, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004493629559874535, + "timestamp": "2025-09-04 04:06:14.887300", + "step": 2326, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:06:14.985953", + "step": 2326, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005800614599138498, + "timestamp": "2025-09-04 04:06:15.004519", + "step": 2327, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:06:15.086700", + "step": 2327, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05111802741885185, + "timestamp": "2025-09-04 04:06:15.102670", + "step": 2328, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:06:15.208703", + "step": 2328, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0008950461633503437, + "timestamp": "2025-09-04 04:06:15.231340", + "step": 2329, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:06:15.325018", + "step": 2329, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0029820986092090607, + "timestamp": "2025-09-04 04:06:15.342129", + "step": 2330, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:06:15.442186", + "step": 2330, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014196853153407574, + "timestamp": "2025-09-04 04:06:15.461081", + "step": 2331, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1168 + ], + "flops": 23360141876800.0 + }, + "timestamp": "2025-09-04 04:06:15.634661", + "step": 2331, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0035891346633434296, + "timestamp": "2025-09-04 04:06:15.668205", + "step": 2332, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:06:15.785753", + "step": 2332, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0016923088114708662, + "timestamp": "2025-09-04 04:06:15.808308", + "step": 2333, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:06:15.910974", + "step": 2333, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0015629819827154279, + "timestamp": "2025-09-04 04:06:15.930249", + "step": 2334, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:06:16.007674", + "step": 2334, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023518525063991547, + "timestamp": "2025-09-04 04:06:16.021688", + "step": 2335, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:06:16.124739", + "step": 2335, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0419270396232605, + "timestamp": "2025-09-04 04:06:16.144731", + "step": 2336, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:06:16.245748", + "step": 2336, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005480342078953981, + "timestamp": "2025-09-04 04:06:16.266752", + "step": 2337, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:06:16.377638", + "step": 2337, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024341512471437454, + "timestamp": "2025-09-04 04:06:16.398272", + "step": 2338, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:06:16.499844", + "step": 2338, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017390906810760498, + "timestamp": "2025-09-04 04:06:16.518835", + "step": 2339, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:06:16.619506", + "step": 2339, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024550272151827812, + "timestamp": "2025-09-04 04:06:16.639169", + "step": 2340, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:06:25.127410", + "step": 2340, + "epoch": 2 + }, + { + "type": "pplx", + "content": 295.22586601571055, + "timestamp": "2025-09-04 04:06:25.130299", + "step": 2340, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:06:25.206476", + "step": 2340, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003865597303956747, + "timestamp": "2025-09-04 04:06:25.221563", + "step": 2341, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:06:25.325472", + "step": 2341, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023083921521902084, + "timestamp": "2025-09-04 04:06:25.344427", + "step": 2342, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 880 + ], + "flops": 17600106910144.0 + }, + "timestamp": "2025-09-04 04:06:25.474558", + "step": 2342, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0006907058414071798, + "timestamp": "2025-09-04 04:06:25.497944", + "step": 2343, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:06:25.591178", + "step": 2343, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01256527565419674, + "timestamp": "2025-09-04 04:06:25.608450", + "step": 2344, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1008 + ], + "flops": 20160122450880.0 + }, + "timestamp": "2025-09-04 04:06:25.751962", + "step": 2344, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0006791522027924657, + "timestamp": "2025-09-04 04:06:25.782760", + "step": 2345, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:06:25.888362", + "step": 2345, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006552206818014383, + "timestamp": "2025-09-04 04:06:25.907296", + "step": 2346, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:06:26.018399", + "step": 2346, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009997514076530933, + "timestamp": "2025-09-04 04:06:26.038487", + "step": 2347, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:06:26.138952", + "step": 2347, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.028129128739237785, + "timestamp": "2025-09-04 04:06:26.158067", + "step": 2348, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:06:26.260093", + "step": 2348, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0038631020579487085, + "timestamp": "2025-09-04 04:06:26.281059", + "step": 2349, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:06:26.385515", + "step": 2349, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00890912301838398, + "timestamp": "2025-09-04 04:06:26.404418", + "step": 2350, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:06:26.487101", + "step": 2350, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0030983267351984978, + "timestamp": "2025-09-04 04:06:26.500988", + "step": 2351, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:06:26.592798", + "step": 2351, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04077935963869095, + "timestamp": "2025-09-04 04:06:26.610095", + "step": 2352, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:06:26.726858", + "step": 2352, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0391593798995018, + "timestamp": "2025-09-04 04:06:26.750833", + "step": 2353, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:06:26.846116", + "step": 2353, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0036758643109351397, + "timestamp": "2025-09-04 04:06:26.863271", + "step": 2354, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:06:26.940414", + "step": 2354, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0077694314531981945, + "timestamp": "2025-09-04 04:06:26.953721", + "step": 2355, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:06:27.057469", + "step": 2355, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.035263847559690475, + "timestamp": "2025-09-04 04:06:27.077180", + "step": 2356, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:06:27.176464", + "step": 2356, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0017548573669046164, + "timestamp": "2025-09-04 04:06:27.196622", + "step": 2357, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 04:06:27.333584", + "step": 2357, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0036123136524111032, + "timestamp": "2025-09-04 04:06:27.359290", + "step": 2358, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:06:27.462130", + "step": 2358, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.026867439970374107, + "timestamp": "2025-09-04 04:06:27.480803", + "step": 2359, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:06:27.571768", + "step": 2359, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011943703517317772, + "timestamp": "2025-09-04 04:06:27.587944", + "step": 2360, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:06:36.064337", + "step": 2360, + "epoch": 2 + }, + { + "type": "pplx", + "content": 298.06816808696493, + "timestamp": "2025-09-04 04:06:36.066522", + "step": 2360, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2360", + "timestamp": "2025-09-04 04:06:36.409656", + "step": 2360, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:06:36.509058", + "step": 2360, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01579303853213787, + "timestamp": "2025-09-04 04:06:36.530166", + "step": 2361, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:06:36.608506", + "step": 2361, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0023343523498624563, + "timestamp": "2025-09-04 04:06:36.622543", + "step": 2362, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:06:36.700242", + "step": 2362, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.030650924891233444, + "timestamp": "2025-09-04 04:06:36.714227", + "step": 2363, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:06:36.824118", + "step": 2363, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0008962932624854147, + "timestamp": "2025-09-04 04:06:36.845479", + "step": 2364, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:06:36.922413", + "step": 2364, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008728813380002975, + "timestamp": "2025-09-04 04:06:36.937914", + "step": 2365, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:06:37.041318", + "step": 2365, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0025963473599404097, + "timestamp": "2025-09-04 04:06:37.060611", + "step": 2366, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:06:37.167037", + "step": 2366, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006974723190069199, + "timestamp": "2025-09-04 04:06:37.187013", + "step": 2367, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:06:37.291474", + "step": 2367, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.026150088757276535, + "timestamp": "2025-09-04 04:06:37.311513", + "step": 2368, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:06:37.416575", + "step": 2368, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.033544786274433136, + "timestamp": "2025-09-04 04:06:37.438548", + "step": 2369, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:06:37.530032", + "step": 2369, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0012710822047665715, + "timestamp": "2025-09-04 04:06:37.546779", + "step": 2370, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:06:37.640113", + "step": 2370, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004774425644427538, + "timestamp": "2025-09-04 04:06:37.657431", + "step": 2371, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:06:37.748688", + "step": 2371, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010192320682108402, + "timestamp": "2025-09-04 04:06:37.766189", + "step": 2372, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:06:37.857984", + "step": 2372, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.047795332968235016, + "timestamp": "2025-09-04 04:06:37.877161", + "step": 2373, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:06:37.960434", + "step": 2373, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018767505884170532, + "timestamp": "2025-09-04 04:06:37.975653", + "step": 2374, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:06:38.073920", + "step": 2374, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017257632687687874, + "timestamp": "2025-09-04 04:06:38.092428", + "step": 2375, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:06:38.194437", + "step": 2375, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007572493981570005, + "timestamp": "2025-09-04 04:06:38.214336", + "step": 2376, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:06:38.313762", + "step": 2376, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0060096923261880875, + "timestamp": "2025-09-04 04:06:38.334427", + "step": 2377, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:06:38.421155", + "step": 2377, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02415098063647747, + "timestamp": "2025-09-04 04:06:38.436761", + "step": 2378, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:06:38.530537", + "step": 2378, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016814231872558594, + "timestamp": "2025-09-04 04:06:38.547930", + "step": 2379, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:06:38.651752", + "step": 2379, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0010228022001683712, + "timestamp": "2025-09-04 04:06:38.671811", + "step": 2380, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:06:47.058540", + "step": 2380, + "epoch": 2 + }, + { + "type": "pplx", + "content": 300.2741385956129, + "timestamp": "2025-09-04 04:06:47.060601", + "step": 2380, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:06:47.156440", + "step": 2380, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05420851334929466, + "timestamp": "2025-09-04 04:06:47.177116", + "step": 2381, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:06:47.282495", + "step": 2381, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013719238340854645, + "timestamp": "2025-09-04 04:06:47.302180", + "step": 2382, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:06:47.405329", + "step": 2382, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01382434368133545, + "timestamp": "2025-09-04 04:06:47.424573", + "step": 2383, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1424 + ], + "flops": 28480172958272.0 + }, + "timestamp": "2025-09-04 04:06:47.635091", + "step": 2383, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017751427367329597, + "timestamp": "2025-09-04 04:06:47.676429", + "step": 2384, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:06:47.773764", + "step": 2384, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.026963358744978905, + "timestamp": "2025-09-04 04:06:47.794292", + "step": 2385, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:06:47.884251", + "step": 2385, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0162715595215559, + "timestamp": "2025-09-04 04:06:47.901019", + "step": 2386, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:06:47.994726", + "step": 2386, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009239349514245987, + "timestamp": "2025-09-04 04:06:48.012230", + "step": 2387, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:06:48.114990", + "step": 2387, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02234196476638317, + "timestamp": "2025-09-04 04:06:48.135143", + "step": 2388, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:06:48.216079", + "step": 2388, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.031179826706647873, + "timestamp": "2025-09-04 04:06:48.232525", + "step": 2389, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:06:48.340364", + "step": 2389, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.025329116731882095, + "timestamp": "2025-09-04 04:06:48.360594", + "step": 2390, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:06:48.461110", + "step": 2390, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022294968366622925, + "timestamp": "2025-09-04 04:06:48.480072", + "step": 2391, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:06:48.576529", + "step": 2391, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0785910040140152, + "timestamp": "2025-09-04 04:06:48.594808", + "step": 2392, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:06:48.668726", + "step": 2392, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024669643491506577, + "timestamp": "2025-09-04 04:06:48.683548", + "step": 2393, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:06:48.782655", + "step": 2393, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023527929559350014, + "timestamp": "2025-09-04 04:06:48.801350", + "step": 2394, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:06:48.917923", + "step": 2394, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01429518312215805, + "timestamp": "2025-09-04 04:06:48.940049", + "step": 2395, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:06:49.043345", + "step": 2395, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003827124135568738, + "timestamp": "2025-09-04 04:06:49.063366", + "step": 2396, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1024 + ], + "flops": 20480124393472.0 + }, + "timestamp": "2025-09-04 04:06:49.204787", + "step": 2396, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010013996623456478, + "timestamp": "2025-09-04 04:06:49.235597", + "step": 2397, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:06:49.343503", + "step": 2397, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011067106388509274, + "timestamp": "2025-09-04 04:06:49.363807", + "step": 2398, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:06:49.460343", + "step": 2398, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.000826965959277004, + "timestamp": "2025-09-04 04:06:49.477861", + "step": 2399, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:06:49.572217", + "step": 2399, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03683660551905632, + "timestamp": "2025-09-04 04:06:49.590419", + "step": 2400, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:06:57.987370", + "step": 2400, + "epoch": 2 + }, + { + "type": "pplx", + "content": 299.87191027633986, + "timestamp": "2025-09-04 04:06:57.989481", + "step": 2400, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2400", + "timestamp": "2025-09-04 04:06:58.497954", + "step": 2400, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:06:58.598527", + "step": 2400, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0408879779279232, + "timestamp": "2025-09-04 04:06:58.619076", + "step": 2401, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:06:58.763688", + "step": 2401, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.07916318625211716, + "timestamp": "2025-09-04 04:06:58.783853", + "step": 2402, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:06:58.913911", + "step": 2402, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009972754865884781, + "timestamp": "2025-09-04 04:06:58.933370", + "step": 2403, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:06:59.098106", + "step": 2403, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005085110664367676, + "timestamp": "2025-09-04 04:06:59.120132", + "step": 2404, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:06:59.303364", + "step": 2404, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006083235610276461, + "timestamp": "2025-09-04 04:06:59.325785", + "step": 2405, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:06:59.433171", + "step": 2405, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018005046993494034, + "timestamp": "2025-09-04 04:06:59.450765", + "step": 2406, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:06:59.565299", + "step": 2406, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0005014762282371521, + "timestamp": "2025-09-04 04:06:59.584616", + "step": 2407, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:06:59.661630", + "step": 2407, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008235539309680462, + "timestamp": "2025-09-04 04:06:59.675910", + "step": 2408, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:06:59.818065", + "step": 2408, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010737213306128979, + "timestamp": "2025-09-04 04:06:59.837867", + "step": 2409, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:07:00.009283", + "step": 2409, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03348386660218239, + "timestamp": "2025-09-04 04:07:00.028112", + "step": 2410, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:07:00.148227", + "step": 2410, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003203654196113348, + "timestamp": "2025-09-04 04:07:00.167426", + "step": 2411, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:07:00.280154", + "step": 2411, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012616422958672047, + "timestamp": "2025-09-04 04:07:00.301563", + "step": 2412, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:07:00.386966", + "step": 2412, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011091896332800388, + "timestamp": "2025-09-04 04:07:00.403586", + "step": 2413, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1184 + ], + "flops": 23680143819392.0 + }, + "timestamp": "2025-09-04 04:07:00.586895", + "step": 2413, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012619656510651112, + "timestamp": "2025-09-04 04:07:00.620884", + "step": 2414, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:07:00.732799", + "step": 2414, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02890246920287609, + "timestamp": "2025-09-04 04:07:00.751815", + "step": 2415, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:07:00.890554", + "step": 2415, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023879971355199814, + "timestamp": "2025-09-04 04:07:00.911487", + "step": 2416, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:07:01.035963", + "step": 2416, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009976472705602646, + "timestamp": "2025-09-04 04:07:01.056672", + "step": 2417, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:07:01.187998", + "step": 2417, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.038926564157009125, + "timestamp": "2025-09-04 04:07:01.201531", + "step": 2418, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:07:01.314823", + "step": 2418, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01832580380141735, + "timestamp": "2025-09-04 04:07:01.332331", + "step": 2419, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:07:01.421325", + "step": 2419, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00956509169191122, + "timestamp": "2025-09-04 04:07:01.437538", + "step": 2420, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:07:09.904505", + "step": 2420, + "epoch": 2 + }, + { + "type": "pplx", + "content": 303.06925253168487, + "timestamp": "2025-09-04 04:07:09.907076", + "step": 2420, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:07:09.979738", + "step": 2420, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.037451133131980896, + "timestamp": "2025-09-04 04:07:09.994374", + "step": 2421, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:07:10.089818", + "step": 2421, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013809522613883018, + "timestamp": "2025-09-04 04:07:10.107208", + "step": 2422, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:07:10.209562", + "step": 2422, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03779786825180054, + "timestamp": "2025-09-04 04:07:10.228790", + "step": 2423, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:07:10.316057", + "step": 2423, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013304756954312325, + "timestamp": "2025-09-04 04:07:10.332431", + "step": 2424, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1232 + ], + "flops": 24640149647168.0 + }, + "timestamp": "2025-09-04 04:07:10.511846", + "step": 2424, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04832831397652626, + "timestamp": "2025-09-04 04:07:10.549441", + "step": 2425, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:07:10.636716", + "step": 2425, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00974750891327858, + "timestamp": "2025-09-04 04:07:10.652143", + "step": 2426, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:07:10.756272", + "step": 2426, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04315938055515289, + "timestamp": "2025-09-04 04:07:10.775526", + "step": 2427, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:07:10.874657", + "step": 2427, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00986342690885067, + "timestamp": "2025-09-04 04:07:10.894019", + "step": 2428, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 944 + ], + "flops": 18880114680512.0 + }, + "timestamp": "2025-09-04 04:07:11.028846", + "step": 2428, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0031775757670402527, + "timestamp": "2025-09-04 04:07:11.057582", + "step": 2429, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:07:11.143954", + "step": 2429, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024582451209425926, + "timestamp": "2025-09-04 04:07:11.159488", + "step": 2430, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:07:11.269272", + "step": 2430, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.09256591647863388, + "timestamp": "2025-09-04 04:07:11.289807", + "step": 2431, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:07:11.391712", + "step": 2431, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014577627182006836, + "timestamp": "2025-09-04 04:07:11.411644", + "step": 2432, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:07:11.501022", + "step": 2432, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0009501639287918806, + "timestamp": "2025-09-04 04:07:11.519394", + "step": 2433, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:07:11.622587", + "step": 2433, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022495364770293236, + "timestamp": "2025-09-04 04:07:11.641457", + "step": 2434, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:07:11.718910", + "step": 2434, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013360848650336266, + "timestamp": "2025-09-04 04:07:11.732852", + "step": 2435, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:07:11.818320", + "step": 2435, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006631570402532816, + "timestamp": "2025-09-04 04:07:11.834542", + "step": 2436, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:07:11.933817", + "step": 2436, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004165567457675934, + "timestamp": "2025-09-04 04:07:11.954489", + "step": 2437, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:07:12.059057", + "step": 2437, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002891003619879484, + "timestamp": "2025-09-04 04:07:12.078289", + "step": 2438, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:07:12.177201", + "step": 2438, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00025852315593510866, + "timestamp": "2025-09-04 04:07:12.195858", + "step": 2439, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 960 + ], + "flops": 19200116623104.0 + }, + "timestamp": "2025-09-04 04:07:12.333048", + "step": 2439, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0613558404147625, + "timestamp": "2025-09-04 04:07:12.360058", + "step": 2440, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:07:20.744500", + "step": 2440, + "epoch": 2 + }, + { + "type": "pplx", + "content": 310.24100791067593, + "timestamp": "2025-09-04 04:07:20.746658", + "step": 2440, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2440", + "timestamp": "2025-09-04 04:07:21.256656", + "step": 2440, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 04:07:21.373924", + "step": 2440, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0030447612516582012, + "timestamp": "2025-09-04 04:07:21.399154", + "step": 2441, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1376 + ], + "flops": 27520167130496.0 + }, + "timestamp": "2025-09-04 04:07:21.602494", + "step": 2441, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.028010720387101173, + "timestamp": "2025-09-04 04:07:21.641816", + "step": 2442, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:07:21.751643", + "step": 2442, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00379212130792439, + "timestamp": "2025-09-04 04:07:21.772269", + "step": 2443, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:07:21.866937", + "step": 2443, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0061768838204443455, + "timestamp": "2025-09-04 04:07:21.885100", + "step": 2444, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:07:21.975206", + "step": 2444, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011033882386982441, + "timestamp": "2025-09-04 04:07:21.994027", + "step": 2445, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:07:22.104069", + "step": 2445, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01739361509680748, + "timestamp": "2025-09-04 04:07:22.124669", + "step": 2446, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:07:22.234369", + "step": 2446, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020273465663194656, + "timestamp": "2025-09-04 04:07:22.255184", + "step": 2447, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:07:22.357414", + "step": 2447, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.021363025531172752, + "timestamp": "2025-09-04 04:07:22.377554", + "step": 2448, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:07:22.470270", + "step": 2448, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012352973222732544, + "timestamp": "2025-09-04 04:07:22.489610", + "step": 2449, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:07:22.574849", + "step": 2449, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009310700930655003, + "timestamp": "2025-09-04 04:07:22.590465", + "step": 2450, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:07:22.681235", + "step": 2450, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012666204944252968, + "timestamp": "2025-09-04 04:07:22.698051", + "step": 2451, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:07:22.782113", + "step": 2451, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002646137960255146, + "timestamp": "2025-09-04 04:07:22.798362", + "step": 2452, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:07:22.889307", + "step": 2452, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03072880022227764, + "timestamp": "2025-09-04 04:07:22.908543", + "step": 2453, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:07:23.012169", + "step": 2453, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01461123675107956, + "timestamp": "2025-09-04 04:07:23.031423", + "step": 2454, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:07:23.143359", + "step": 2454, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010309387929737568, + "timestamp": "2025-09-04 04:07:23.164090", + "step": 2455, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:07:23.267195", + "step": 2455, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.07364504784345627, + "timestamp": "2025-09-04 04:07:23.287265", + "step": 2456, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:07:23.371287", + "step": 2456, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.028103487566113472, + "timestamp": "2025-09-04 04:07:23.388274", + "step": 2457, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 04:07:23.516938", + "step": 2457, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03834032267332077, + "timestamp": "2025-09-04 04:07:23.540149", + "step": 2458, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:07:23.651140", + "step": 2458, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0072427773848176, + "timestamp": "2025-09-04 04:07:23.671797", + "step": 2459, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:07:23.770961", + "step": 2459, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0034062564373016357, + "timestamp": "2025-09-04 04:07:23.790327", + "step": 2460, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:07:32.212762", + "step": 2460, + "epoch": 2 + }, + { + "type": "pplx", + "content": 315.36553546678925, + "timestamp": "2025-09-04 04:07:32.215085", + "step": 2460, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:07:32.320973", + "step": 2460, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0368022620677948, + "timestamp": "2025-09-04 04:07:32.343546", + "step": 2461, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:07:32.446908", + "step": 2461, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014917043037712574, + "timestamp": "2025-09-04 04:07:32.466067", + "step": 2462, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:07:32.543207", + "step": 2462, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.07748901844024658, + "timestamp": "2025-09-04 04:07:32.557180", + "step": 2463, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:07:32.661303", + "step": 2463, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006906221155077219, + "timestamp": "2025-09-04 04:07:32.681071", + "step": 2464, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:07:32.777954", + "step": 2464, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03316435217857361, + "timestamp": "2025-09-04 04:07:32.798213", + "step": 2465, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:07:32.892126", + "step": 2465, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0017650446388870478, + "timestamp": "2025-09-04 04:07:32.909510", + "step": 2466, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1024 + ], + "flops": 20480124393472.0 + }, + "timestamp": "2025-09-04 04:07:33.055191", + "step": 2466, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010998114012181759, + "timestamp": "2025-09-04 04:07:33.083267", + "step": 2467, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 416 + ], + "flops": 8320050574976.0 + }, + "timestamp": "2025-09-04 04:07:33.153289", + "step": 2467, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00827596615999937, + "timestamp": "2025-09-04 04:07:33.166670", + "step": 2468, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:07:33.267008", + "step": 2468, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010986930690705776, + "timestamp": "2025-09-04 04:07:33.288172", + "step": 2469, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:07:33.398361", + "step": 2469, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002616006415337324, + "timestamp": "2025-09-04 04:07:33.418487", + "step": 2470, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1120 + ], + "flops": 22400136049024.0 + }, + "timestamp": "2025-09-04 04:07:33.581804", + "step": 2470, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010370907373726368, + "timestamp": "2025-09-04 04:07:33.613941", + "step": 2471, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:07:33.700684", + "step": 2471, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.018909158185124397, + "timestamp": "2025-09-04 04:07:33.716679", + "step": 2472, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:07:33.821067", + "step": 2472, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03325289860367775, + "timestamp": "2025-09-04 04:07:33.842023", + "step": 2473, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 416 + ], + "flops": 8320050574976.0 + }, + "timestamp": "2025-09-04 04:07:33.912109", + "step": 2473, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0038535459898412228, + "timestamp": "2025-09-04 04:07:33.924635", + "step": 2474, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:07:34.035300", + "step": 2474, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03316061198711395, + "timestamp": "2025-09-04 04:07:34.055652", + "step": 2475, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:07:34.148736", + "step": 2475, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.042802825570106506, + "timestamp": "2025-09-04 04:07:34.164952", + "step": 2476, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:07:34.265241", + "step": 2476, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013982797972857952, + "timestamp": "2025-09-04 04:07:34.286061", + "step": 2477, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:07:34.388236", + "step": 2477, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00205409643240273, + "timestamp": "2025-09-04 04:07:34.406622", + "step": 2478, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:07:34.489076", + "step": 2478, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0019991924054920673, + "timestamp": "2025-09-04 04:07:34.503233", + "step": 2479, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:07:34.612716", + "step": 2479, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03043895773589611, + "timestamp": "2025-09-04 04:07:34.633505", + "step": 2480, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:07:43.012781", + "step": 2480, + "epoch": 2 + }, + { + "type": "pplx", + "content": 318.30207618622006, + "timestamp": "2025-09-04 04:07:43.014765", + "step": 2480, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2480", + "timestamp": "2025-09-04 04:07:43.373304", + "step": 2480, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:07:43.456246", + "step": 2480, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.023624232038855553, + "timestamp": "2025-09-04 04:07:43.473367", + "step": 2481, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:07:43.567295", + "step": 2481, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.028039980679750443, + "timestamp": "2025-09-04 04:07:43.584815", + "step": 2482, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:07:43.691965", + "step": 2482, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04251798987388611, + "timestamp": "2025-09-04 04:07:43.712226", + "step": 2483, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:07:43.788241", + "step": 2483, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013368581421673298, + "timestamp": "2025-09-04 04:07:43.802722", + "step": 2484, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:07:43.906725", + "step": 2484, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00481030810624361, + "timestamp": "2025-09-04 04:07:43.928625", + "step": 2485, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:07:44.039992", + "step": 2485, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03684372827410698, + "timestamp": "2025-09-04 04:07:44.060654", + "step": 2486, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:07:44.153521", + "step": 2486, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005444700364023447, + "timestamp": "2025-09-04 04:07:44.170668", + "step": 2487, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:07:44.270187", + "step": 2487, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0024230824783444405, + "timestamp": "2025-09-04 04:07:44.289865", + "step": 2488, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1088 + ], + "flops": 21760132163840.0 + }, + "timestamp": "2025-09-04 04:07:44.443208", + "step": 2488, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.026805497705936432, + "timestamp": "2025-09-04 04:07:44.476712", + "step": 2489, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:07:44.579744", + "step": 2489, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02435290813446045, + "timestamp": "2025-09-04 04:07:44.599073", + "step": 2490, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:07:44.692908", + "step": 2490, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0021042758598923683, + "timestamp": "2025-09-04 04:07:44.710316", + "step": 2491, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:07:44.819017", + "step": 2491, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005590126849710941, + "timestamp": "2025-09-04 04:07:44.840113", + "step": 2492, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:07:44.923546", + "step": 2492, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008740575052797794, + "timestamp": "2025-09-04 04:07:44.940517", + "step": 2493, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:07:45.026735", + "step": 2493, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024248680099844933, + "timestamp": "2025-09-04 04:07:45.042313", + "step": 2494, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:07:45.142261", + "step": 2494, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.028292085975408554, + "timestamp": "2025-09-04 04:07:45.161082", + "step": 2495, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:07:45.255265", + "step": 2495, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014871833845973015, + "timestamp": "2025-09-04 04:07:45.273507", + "step": 2496, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:07:45.374285", + "step": 2496, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.017097758129239082, + "timestamp": "2025-09-04 04:07:45.395462", + "step": 2497, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 800 + ], + "flops": 16000097197184.0 + }, + "timestamp": "2025-09-04 04:07:45.515770", + "step": 2497, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014303861185908318, + "timestamp": "2025-09-04 04:07:45.537472", + "step": 2498, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:07:45.628732", + "step": 2498, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0032022215891629457, + "timestamp": "2025-09-04 04:07:45.645560", + "step": 2499, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:07:45.740982", + "step": 2499, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0452834852039814, + "timestamp": "2025-09-04 04:07:45.759178", + "step": 2500, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:07:54.128547", + "step": 2500, + "epoch": 2 + }, + { + "type": "pplx", + "content": 319.347960568569, + "timestamp": "2025-09-04 04:07:54.130884", + "step": 2500, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 384 + ], + "flops": 7680046689792.0 + }, + "timestamp": "2025-09-04 04:07:54.190519", + "step": 2500, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.007545833010226488, + "timestamp": "2025-09-04 04:07:54.202250", + "step": 2501, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:07:54.284283", + "step": 2501, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008382921107113361, + "timestamp": "2025-09-04 04:07:54.299354", + "step": 2502, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 896 + ], + "flops": 17920108852736.0 + }, + "timestamp": "2025-09-04 04:07:54.428234", + "step": 2502, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010973788797855377, + "timestamp": "2025-09-04 04:07:54.452789", + "step": 2503, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:07:54.538660", + "step": 2503, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04710651561617851, + "timestamp": "2025-09-04 04:07:54.555139", + "step": 2504, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:07:54.661887", + "step": 2504, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00142197054810822, + "timestamp": "2025-09-04 04:07:54.684651", + "step": 2505, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:07:54.794898", + "step": 2505, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04192418232560158, + "timestamp": "2025-09-04 04:07:54.815339", + "step": 2506, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:07:54.918276", + "step": 2506, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024896910414099693, + "timestamp": "2025-09-04 04:07:54.937467", + "step": 2507, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:07:55.038731", + "step": 2507, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02149958163499832, + "timestamp": "2025-09-04 04:07:55.058382", + "step": 2508, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:07:55.145206", + "step": 2508, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012380536645650864, + "timestamp": "2025-09-04 04:07:55.160522", + "step": 2509, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:07:55.270997", + "step": 2509, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0016803938196972013, + "timestamp": "2025-09-04 04:07:55.291308", + "step": 2510, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:07:55.396245", + "step": 2510, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015394407324492931, + "timestamp": "2025-09-04 04:07:55.415316", + "step": 2511, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:07:55.518957", + "step": 2511, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005977288819849491, + "timestamp": "2025-09-04 04:07:55.538996", + "step": 2512, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:07:55.626412", + "step": 2512, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01742619276046753, + "timestamp": "2025-09-04 04:07:55.644714", + "step": 2513, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:07:55.747687", + "step": 2513, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006442517042160034, + "timestamp": "2025-09-04 04:07:55.767073", + "step": 2514, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:07:55.866978", + "step": 2514, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013937929645180702, + "timestamp": "2025-09-04 04:07:55.885738", + "step": 2515, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:07:55.987972", + "step": 2515, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005017734598368406, + "timestamp": "2025-09-04 04:07:56.007994", + "step": 2516, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 816 + ], + "flops": 16320099139776.0 + }, + "timestamp": "2025-09-04 04:07:56.125464", + "step": 2516, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002909827046096325, + "timestamp": "2025-09-04 04:07:56.150788", + "step": 2517, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:07:56.236953", + "step": 2517, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0124394865706563, + "timestamp": "2025-09-04 04:07:56.252576", + "step": 2518, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:07:56.352113", + "step": 2518, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00413033552467823, + "timestamp": "2025-09-04 04:07:56.370492", + "step": 2519, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:07:56.470526", + "step": 2519, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.044936034828424454, + "timestamp": "2025-09-04 04:07:56.490181", + "step": 2520, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:08:04.841099", + "step": 2520, + "epoch": 2 + }, + { + "type": "pplx", + "content": 321.3946038942604, + "timestamp": "2025-09-04 04:08:04.843128", + "step": 2520, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2520", + "timestamp": "2025-09-04 04:08:05.181843", + "step": 2520, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:08:05.281078", + "step": 2520, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004612304270267487, + "timestamp": "2025-09-04 04:08:05.301969", + "step": 2521, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:08:05.397001", + "step": 2521, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.028999704867601395, + "timestamp": "2025-09-04 04:08:05.414484", + "step": 2522, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:08:05.518293", + "step": 2522, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006988754495978355, + "timestamp": "2025-09-04 04:08:05.537400", + "step": 2523, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:08:05.636988", + "step": 2523, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.07554414868354797, + "timestamp": "2025-09-04 04:08:05.656374", + "step": 2524, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:08:05.756212", + "step": 2524, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004102020058780909, + "timestamp": "2025-09-04 04:08:05.776850", + "step": 2525, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:08:05.885428", + "step": 2525, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0004931488656438887, + "timestamp": "2025-09-04 04:08:05.904557", + "step": 2526, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:08:06.012440", + "step": 2526, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02074488066136837, + "timestamp": "2025-09-04 04:08:06.032722", + "step": 2527, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:08:06.148947", + "step": 2527, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.00027644523652270436, + "timestamp": "2025-09-04 04:08:06.171613", + "step": 2528, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:08:06.263638", + "step": 2528, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0038577071391046047, + "timestamp": "2025-09-04 04:08:06.282745", + "step": 2529, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:08:06.360604", + "step": 2529, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.026262257248163223, + "timestamp": "2025-09-04 04:08:06.374738", + "step": 2530, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:08:06.481073", + "step": 2530, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01689167320728302, + "timestamp": "2025-09-04 04:08:06.500842", + "step": 2531, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:08:06.603402", + "step": 2531, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04883643984794617, + "timestamp": "2025-09-04 04:08:06.623391", + "step": 2532, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:08:06.706852", + "step": 2532, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005458258092403412, + "timestamp": "2025-09-04 04:08:06.723480", + "step": 2533, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:08:06.840181", + "step": 2533, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01368082407861948, + "timestamp": "2025-09-04 04:08:06.862251", + "step": 2534, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:08:06.938420", + "step": 2534, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008800865150988102, + "timestamp": "2025-09-04 04:08:06.952160", + "step": 2535, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:08:07.058634", + "step": 2535, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0017213658429682255, + "timestamp": "2025-09-04 04:08:07.079073", + "step": 2536, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:08:07.187701", + "step": 2536, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024919630959630013, + "timestamp": "2025-09-04 04:08:07.210278", + "step": 2537, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:08:07.303713", + "step": 2537, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005876360926777124, + "timestamp": "2025-09-04 04:08:07.320835", + "step": 2538, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:08:07.405122", + "step": 2538, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.06584823131561279, + "timestamp": "2025-09-04 04:08:07.420373", + "step": 2539, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:08:07.513278", + "step": 2539, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01981205679476261, + "timestamp": "2025-09-04 04:08:07.531141", + "step": 2540, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:08:15.901844", + "step": 2540, + "epoch": 2 + }, + { + "type": "pplx", + "content": 322.01266845314177, + "timestamp": "2025-09-04 04:08:15.903878", + "step": 2540, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:08:16.008572", + "step": 2540, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0399375818669796, + "timestamp": "2025-09-04 04:08:16.031089", + "step": 2541, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:08:16.140199", + "step": 2541, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004767777398228645, + "timestamp": "2025-09-04 04:08:16.160739", + "step": 2542, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:08:16.256471", + "step": 2542, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02117767557501793, + "timestamp": "2025-09-04 04:08:16.273940", + "step": 2543, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 432 + ], + "flops": 8640052517568.0 + }, + "timestamp": "2025-09-04 04:08:16.344595", + "step": 2543, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010128835216164589, + "timestamp": "2025-09-04 04:08:16.358124", + "step": 2544, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:08:16.439671", + "step": 2544, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0018931415397673845, + "timestamp": "2025-09-04 04:08:16.456305", + "step": 2545, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:08:16.555368", + "step": 2545, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.011107025668025017, + "timestamp": "2025-09-04 04:08:16.573896", + "step": 2546, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:08:16.674332", + "step": 2546, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.022087909281253815, + "timestamp": "2025-09-04 04:08:16.693190", + "step": 2547, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:08:16.796126", + "step": 2547, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04829714074730873, + "timestamp": "2025-09-04 04:08:16.816098", + "step": 2548, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:08:16.916158", + "step": 2548, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.009213853627443314, + "timestamp": "2025-09-04 04:08:16.936557", + "step": 2549, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:08:17.043596", + "step": 2549, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.029857391491532326, + "timestamp": "2025-09-04 04:08:17.063569", + "step": 2550, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:08:17.167443", + "step": 2550, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004018782638013363, + "timestamp": "2025-09-04 04:08:17.186721", + "step": 2551, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:08:17.261179", + "step": 2551, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.029023800045251846, + "timestamp": "2025-09-04 04:08:17.275555", + "step": 2552, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:08:17.366789", + "step": 2552, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.026715071871876717, + "timestamp": "2025-09-04 04:08:17.388002", + "step": 2553, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:08:17.540389", + "step": 2553, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.048667825758457184, + "timestamp": "2025-09-04 04:08:17.560427", + "step": 2554, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:08:17.700200", + "step": 2554, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02002848871052265, + "timestamp": "2025-09-04 04:08:17.715659", + "step": 2555, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:08:17.829153", + "step": 2555, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015101251192390919, + "timestamp": "2025-09-04 04:08:17.847046", + "step": 2556, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:08:17.937680", + "step": 2556, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.001839878037571907, + "timestamp": "2025-09-04 04:08:17.956055", + "step": 2557, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:08:18.033760", + "step": 2557, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0144809540361166, + "timestamp": "2025-09-04 04:08:18.047547", + "step": 2558, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:08:18.152111", + "step": 2558, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002240030327811837, + "timestamp": "2025-09-04 04:08:18.171263", + "step": 2559, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:08:18.273966", + "step": 2559, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006072205025702715, + "timestamp": "2025-09-04 04:08:18.293971", + "step": 2560, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:08:26.673676", + "step": 2560, + "epoch": 2 + }, + { + "type": "pplx", + "content": 321.77623584602946, + "timestamp": "2025-09-04 04:08:26.675704", + "step": 2560, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2560", + "timestamp": "2025-09-04 04:08:27.019902", + "step": 2560, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:08:27.100761", + "step": 2560, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.025917798280715942, + "timestamp": "2025-09-04 04:08:27.117323", + "step": 2561, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:08:27.225582", + "step": 2561, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.025179943069815636, + "timestamp": "2025-09-04 04:08:27.245639", + "step": 2562, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:08:27.347057", + "step": 2562, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014063333161175251, + "timestamp": "2025-09-04 04:08:27.366276", + "step": 2563, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:08:27.460070", + "step": 2563, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0029370649717748165, + "timestamp": "2025-09-04 04:08:27.477708", + "step": 2564, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:08:27.582741", + "step": 2564, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.014166107401251793, + "timestamp": "2025-09-04 04:08:27.604754", + "step": 2565, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:08:27.715409", + "step": 2565, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.06013331934809685, + "timestamp": "2025-09-04 04:08:27.736200", + "step": 2566, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1024 + ], + "flops": 20480124393472.0 + }, + "timestamp": "2025-09-04 04:08:27.881746", + "step": 2566, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.008028823882341385, + "timestamp": "2025-09-04 04:08:27.909876", + "step": 2567, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:08:28.010271", + "step": 2567, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04439549893140793, + "timestamp": "2025-09-04 04:08:28.029939", + "step": 2568, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:08:28.120319", + "step": 2568, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.001983765745535493, + "timestamp": "2025-09-04 04:08:28.139120", + "step": 2569, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:08:28.222574", + "step": 2569, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.13214930891990662, + "timestamp": "2025-09-04 04:08:28.237917", + "step": 2570, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:08:28.341573", + "step": 2570, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02340877056121826, + "timestamp": "2025-09-04 04:08:28.360697", + "step": 2571, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:08:28.437650", + "step": 2571, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03975323215126991, + "timestamp": "2025-09-04 04:08:28.452202", + "step": 2572, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:08:28.534148", + "step": 2572, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.013518854975700378, + "timestamp": "2025-09-04 04:08:28.551256", + "step": 2573, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:08:28.660249", + "step": 2573, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.039588313549757004, + "timestamp": "2025-09-04 04:08:28.680318", + "step": 2574, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:08:28.788512", + "step": 2574, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.002232232363894582, + "timestamp": "2025-09-04 04:08:28.809195", + "step": 2575, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:08:28.901049", + "step": 2575, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.015773437917232513, + "timestamp": "2025-09-04 04:08:28.919005", + "step": 2576, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:08:29.009843", + "step": 2576, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01339112501591444, + "timestamp": "2025-09-04 04:08:29.028845", + "step": 2577, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:08:29.130668", + "step": 2577, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.016942497342824936, + "timestamp": "2025-09-04 04:08:29.149712", + "step": 2578, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:08:29.223347", + "step": 2578, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.024486076086759567, + "timestamp": "2025-09-04 04:08:29.237004", + "step": 2579, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:08:29.320887", + "step": 2579, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.03973688185214996, + "timestamp": "2025-09-04 04:08:29.337239", + "step": 2580, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:08:37.715930", + "step": 2580, + "epoch": 2 + }, + { + "type": "pplx", + "content": 317.32499137431523, + "timestamp": "2025-09-04 04:08:37.718369", + "step": 2580, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:08:37.815580", + "step": 2580, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004058394581079483, + "timestamp": "2025-09-04 04:08:37.836637", + "step": 2581, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:08:37.923125", + "step": 2581, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.006692761089652777, + "timestamp": "2025-09-04 04:08:37.938509", + "step": 2582, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:08:38.034616", + "step": 2582, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01003364846110344, + "timestamp": "2025-09-04 04:08:38.052107", + "step": 2583, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:08:38.147127", + "step": 2583, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0015831181081011891, + "timestamp": "2025-09-04 04:08:38.165025", + "step": 2584, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:08:38.257632", + "step": 2584, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.02085372433066368, + "timestamp": "2025-09-04 04:08:38.276733", + "step": 2585, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:08:38.379441", + "step": 2585, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012194113805890083, + "timestamp": "2025-09-04 04:08:38.398062", + "step": 2586, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:08:38.497361", + "step": 2586, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.020145049318671227, + "timestamp": "2025-09-04 04:08:38.515933", + "step": 2587, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:08:38.611335", + "step": 2587, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01815951056778431, + "timestamp": "2025-09-04 04:08:38.629608", + "step": 2588, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:08:38.737350", + "step": 2588, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0061141857877373695, + "timestamp": "2025-09-04 04:08:38.760046", + "step": 2589, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:08:38.843486", + "step": 2589, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.04689551517367363, + "timestamp": "2025-09-04 04:08:38.858515", + "step": 2590, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:08:38.962843", + "step": 2590, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.005039518233388662, + "timestamp": "2025-09-04 04:08:38.982095", + "step": 2591, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:08:39.083944", + "step": 2591, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003636001143604517, + "timestamp": "2025-09-04 04:08:39.103553", + "step": 2592, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:08:39.194471", + "step": 2592, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0033800839446485043, + "timestamp": "2025-09-04 04:08:39.213259", + "step": 2593, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:08:39.323734", + "step": 2593, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.01337014976888895, + "timestamp": "2025-09-04 04:08:39.344344", + "step": 2594, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:08:39.445586", + "step": 2594, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.003958144225180149, + "timestamp": "2025-09-04 04:08:39.464458", + "step": 2595, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 912 + ], + "flops": 18240110795328.0 + }, + "timestamp": "2025-09-04 04:08:39.598304", + "step": 2595, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.004944264888763428, + "timestamp": "2025-09-04 04:08:39.623718", + "step": 2596, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:08:39.715344", + "step": 2596, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.012598078697919846, + "timestamp": "2025-09-04 04:08:39.734053", + "step": 2597, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:08:39.811419", + "step": 2597, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.010607503354549408, + "timestamp": "2025-09-04 04:08:39.825419", + "step": 2598, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:08:39.918935", + "step": 2598, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.05367982015013695, + "timestamp": "2025-09-04 04:08:39.936039", + "step": 2599, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:08:40.045928", + "step": 2599, + "epoch": 2 + }, + { + "type": "loss", + "content": 0.0185086727142334, + "timestamp": "2025-09-04 04:08:40.067205", + "step": 2600, + "epoch": 2 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:08:48.439146", + "step": 2600, + "epoch": 2 + }, + { + "type": "pplx", + "content": 314.9293203900891, + "timestamp": "2025-09-04 04:08:48.441562", + "step": 2600, + "epoch": 2 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2600", + "timestamp": "2025-09-04 04:08:48.943661", + "step": 2600, + "epoch": 2 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:08:49.017512", + "step": 2600, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002318183658644557, + "timestamp": "2025-09-04 04:08:49.032219", + "step": 2601, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:08:49.134643", + "step": 2601, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02105123922228813, + "timestamp": "2025-09-04 04:08:49.153886", + "step": 2602, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:08:49.255669", + "step": 2602, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02376515232026577, + "timestamp": "2025-09-04 04:08:49.274554", + "step": 2603, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:08:49.378640", + "step": 2603, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.027340862900018692, + "timestamp": "2025-09-04 04:08:49.398723", + "step": 2604, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:08:49.490403", + "step": 2604, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007925122044980526, + "timestamp": "2025-09-04 04:08:49.509557", + "step": 2605, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:08:49.603779", + "step": 2605, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006013544742017984, + "timestamp": "2025-09-04 04:08:49.621155", + "step": 2606, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:08:49.733300", + "step": 2606, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011563356965780258, + "timestamp": "2025-09-04 04:08:49.753611", + "step": 2607, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:08:49.856580", + "step": 2607, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006092959549278021, + "timestamp": "2025-09-04 04:08:49.876556", + "step": 2608, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:08:49.980849", + "step": 2608, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01699855737388134, + "timestamp": "2025-09-04 04:08:50.002771", + "step": 2609, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:08:50.097356", + "step": 2609, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002047803020104766, + "timestamp": "2025-09-04 04:08:50.114758", + "step": 2610, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 04:08:50.187652", + "step": 2610, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.028143590316176414, + "timestamp": "2025-09-04 04:08:50.200565", + "step": 2611, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:08:50.295727", + "step": 2611, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0071517350152134895, + "timestamp": "2025-09-04 04:08:50.313984", + "step": 2612, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:08:50.410801", + "step": 2612, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01916099339723587, + "timestamp": "2025-09-04 04:08:50.431216", + "step": 2613, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:08:50.518259", + "step": 2613, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006362794432789087, + "timestamp": "2025-09-04 04:08:50.533877", + "step": 2614, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:08:50.643528", + "step": 2614, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006359405815601349, + "timestamp": "2025-09-04 04:08:50.663843", + "step": 2615, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:08:50.750539", + "step": 2615, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0024103245232254267, + "timestamp": "2025-09-04 04:08:50.766946", + "step": 2616, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:08:50.859755", + "step": 2616, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007830057293176651, + "timestamp": "2025-09-04 04:08:50.878988", + "step": 2617, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:08:50.956543", + "step": 2617, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.05507553741335869, + "timestamp": "2025-09-04 04:08:50.970499", + "step": 2618, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 04:08:51.042830", + "step": 2618, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007564071100205183, + "timestamp": "2025-09-04 04:08:51.055769", + "step": 2619, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:08:51.165873", + "step": 2619, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0021915908437222242, + "timestamp": "2025-09-04 04:08:51.187298", + "step": 2620, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:08:59.590829", + "step": 2620, + "epoch": 3 + }, + { + "type": "pplx", + "content": 315.285162165603, + "timestamp": "2025-09-04 04:08:59.592803", + "step": 2620, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:08:59.696649", + "step": 2620, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0060343146324157715, + "timestamp": "2025-09-04 04:08:59.718904", + "step": 2621, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1392 + ], + "flops": 27840169073088.0 + }, + "timestamp": "2025-09-04 04:08:59.923521", + "step": 2621, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0053197117522358894, + "timestamp": "2025-09-04 04:08:59.963010", + "step": 2622, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:09:00.073876", + "step": 2622, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.020664365962147713, + "timestamp": "2025-09-04 04:09:00.094500", + "step": 2623, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1472 + ], + "flops": 29440178786048.0 + }, + "timestamp": "2025-09-04 04:09:00.309867", + "step": 2623, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.015484297648072243, + "timestamp": "2025-09-04 04:09:00.351553", + "step": 2624, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:09:00.452502", + "step": 2624, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002410691697150469, + "timestamp": "2025-09-04 04:09:00.473673", + "step": 2625, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:09:00.579481", + "step": 2625, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.019456349313259125, + "timestamp": "2025-09-04 04:09:00.599522", + "step": 2626, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:09:00.690498", + "step": 2626, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008297095075249672, + "timestamp": "2025-09-04 04:09:00.707257", + "step": 2627, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:09:00.807017", + "step": 2627, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003302738768979907, + "timestamp": "2025-09-04 04:09:00.826356", + "step": 2628, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:09:00.932212", + "step": 2628, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.11121071875095367, + "timestamp": "2025-09-04 04:09:00.954440", + "step": 2629, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:09:01.068831", + "step": 2629, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02678021229803562, + "timestamp": "2025-09-04 04:09:01.089309", + "step": 2630, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:09:01.195192", + "step": 2630, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00880197249352932, + "timestamp": "2025-09-04 04:09:01.214228", + "step": 2631, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:09:01.308951", + "step": 2631, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011775809340178967, + "timestamp": "2025-09-04 04:09:01.326840", + "step": 2632, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:09:01.400378", + "step": 2632, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010084496811032295, + "timestamp": "2025-09-04 04:09:01.415169", + "step": 2633, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:09:01.504764", + "step": 2633, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001346323057077825, + "timestamp": "2025-09-04 04:09:01.521617", + "step": 2634, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:09:01.623955", + "step": 2634, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016563931480050087, + "timestamp": "2025-09-04 04:09:01.643140", + "step": 2635, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:09:01.739409", + "step": 2635, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002511340891942382, + "timestamp": "2025-09-04 04:09:01.757733", + "step": 2636, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:09:01.863630", + "step": 2636, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.1092916801571846, + "timestamp": "2025-09-04 04:09:01.886104", + "step": 2637, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:09:01.995722", + "step": 2637, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016762439161539078, + "timestamp": "2025-09-04 04:09:02.016030", + "step": 2638, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:09:02.099780", + "step": 2638, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0016962930094450712, + "timestamp": "2025-09-04 04:09:02.115114", + "step": 2639, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:09:02.191928", + "step": 2639, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01071829255670309, + "timestamp": "2025-09-04 04:09:02.206841", + "step": 2640, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:09:10.584308", + "step": 2640, + "epoch": 3 + }, + { + "type": "pplx", + "content": 312.75249298706785, + "timestamp": "2025-09-04 04:09:10.586294", + "step": 2640, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2640", + "timestamp": "2025-09-04 04:09:10.941242", + "step": 2640, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:09:11.023411", + "step": 2640, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014106813818216324, + "timestamp": "2025-09-04 04:09:11.039997", + "step": 2641, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:09:11.139999", + "step": 2641, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005073550622910261, + "timestamp": "2025-09-04 04:09:11.158856", + "step": 2642, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:09:11.265331", + "step": 2642, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013524402864277363, + "timestamp": "2025-09-04 04:09:11.285345", + "step": 2643, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:09:11.368162", + "step": 2643, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009907763451337814, + "timestamp": "2025-09-04 04:09:11.384138", + "step": 2644, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:09:11.457840", + "step": 2644, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002835202729329467, + "timestamp": "2025-09-04 04:09:11.472809", + "step": 2645, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:09:11.581086", + "step": 2645, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0016281316056847572, + "timestamp": "2025-09-04 04:09:11.601399", + "step": 2646, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:09:11.704336", + "step": 2646, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0024171601980924606, + "timestamp": "2025-09-04 04:09:11.723618", + "step": 2647, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:09:11.819889", + "step": 2647, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002636190503835678, + "timestamp": "2025-09-04 04:09:11.837434", + "step": 2648, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:09:11.934303", + "step": 2648, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00617353105917573, + "timestamp": "2025-09-04 04:09:11.954720", + "step": 2649, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:09:12.047604", + "step": 2649, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0027180935721844435, + "timestamp": "2025-09-04 04:09:12.064695", + "step": 2650, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:09:12.166834", + "step": 2650, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004888160619884729, + "timestamp": "2025-09-04 04:09:12.186041", + "step": 2651, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:09:12.269214", + "step": 2651, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00197249511256814, + "timestamp": "2025-09-04 04:09:12.285048", + "step": 2652, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:09:12.388267", + "step": 2652, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006813482032157481, + "timestamp": "2025-09-04 04:09:12.410251", + "step": 2653, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:09:12.505553", + "step": 2653, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0030994487460702658, + "timestamp": "2025-09-04 04:09:12.523054", + "step": 2654, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:09:12.612906", + "step": 2654, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002902757376432419, + "timestamp": "2025-09-04 04:09:12.629822", + "step": 2655, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:09:12.707648", + "step": 2655, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0018415531376376748, + "timestamp": "2025-09-04 04:09:12.722595", + "step": 2656, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:09:12.820427", + "step": 2656, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.015612171031534672, + "timestamp": "2025-09-04 04:09:12.841224", + "step": 2657, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:09:12.941218", + "step": 2657, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.042000770568847656, + "timestamp": "2025-09-04 04:09:12.959702", + "step": 2658, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:09:13.054817", + "step": 2658, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0024185827933251858, + "timestamp": "2025-09-04 04:09:13.072416", + "step": 2659, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:09:13.178902", + "step": 2659, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02803855948150158, + "timestamp": "2025-09-04 04:09:13.199805", + "step": 2660, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:09:21.711234", + "step": 2660, + "epoch": 3 + }, + { + "type": "pplx", + "content": 310.74890931755016, + "timestamp": "2025-09-04 04:09:21.713177", + "step": 2660, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:09:21.805471", + "step": 2660, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0010291390353813767, + "timestamp": "2025-09-04 04:09:21.824559", + "step": 2661, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:09:21.934579", + "step": 2661, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005789092276245356, + "timestamp": "2025-09-04 04:09:21.955070", + "step": 2662, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:09:22.046012", + "step": 2662, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.015148711390793324, + "timestamp": "2025-09-04 04:09:22.062794", + "step": 2663, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1488 + ], + "flops": 29760180728640.0 + }, + "timestamp": "2025-09-04 04:09:22.282186", + "step": 2663, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004448336083441973, + "timestamp": "2025-09-04 04:09:22.325169", + "step": 2664, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:09:22.417654", + "step": 2664, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0008569728815928102, + "timestamp": "2025-09-04 04:09:22.436738", + "step": 2665, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:09:22.545585", + "step": 2665, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.034380171447992325, + "timestamp": "2025-09-04 04:09:22.565817", + "step": 2666, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 880 + ], + "flops": 17600106910144.0 + }, + "timestamp": "2025-09-04 04:09:22.695282", + "step": 2666, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.020487092435359955, + "timestamp": "2025-09-04 04:09:22.718863", + "step": 2667, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:09:22.824617", + "step": 2667, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004988936707377434, + "timestamp": "2025-09-04 04:09:22.845347", + "step": 2668, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:09:22.928065", + "step": 2668, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0038620613049715757, + "timestamp": "2025-09-04 04:09:22.944722", + "step": 2669, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:09:23.048743", + "step": 2669, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.018743831664323807, + "timestamp": "2025-09-04 04:09:23.068031", + "step": 2670, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:09:23.146345", + "step": 2670, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.044233791530132294, + "timestamp": "2025-09-04 04:09:23.160449", + "step": 2671, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:09:23.268352", + "step": 2671, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004456370137631893, + "timestamp": "2025-09-04 04:09:23.289464", + "step": 2672, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:09:23.386870", + "step": 2672, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.08841562271118164, + "timestamp": "2025-09-04 04:09:23.407241", + "step": 2673, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:09:23.518418", + "step": 2673, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005260918755084276, + "timestamp": "2025-09-04 04:09:23.539009", + "step": 2674, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:09:23.645223", + "step": 2674, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009504856541752815, + "timestamp": "2025-09-04 04:09:23.665320", + "step": 2675, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:09:23.749594", + "step": 2675, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0136948861181736, + "timestamp": "2025-09-04 04:09:23.765576", + "step": 2676, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:09:23.847090", + "step": 2676, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00021050203940831125, + "timestamp": "2025-09-04 04:09:23.863709", + "step": 2677, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:09:23.963423", + "step": 2677, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007187894079834223, + "timestamp": "2025-09-04 04:09:23.981938", + "step": 2678, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:09:24.093209", + "step": 2678, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02946031652390957, + "timestamp": "2025-09-04 04:09:24.113901", + "step": 2679, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:09:24.220814", + "step": 2679, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004950105212628841, + "timestamp": "2025-09-04 04:09:24.241589", + "step": 2680, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:09:32.746359", + "step": 2680, + "epoch": 3 + }, + { + "type": "pplx", + "content": 310.73999203780255, + "timestamp": "2025-09-04 04:09:32.748816", + "step": 2680, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2680", + "timestamp": "2025-09-04 04:09:33.104178", + "step": 2680, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:09:33.210287", + "step": 2680, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02347641810774803, + "timestamp": "2025-09-04 04:09:33.232593", + "step": 2681, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:09:33.327629", + "step": 2681, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.019025633111596107, + "timestamp": "2025-09-04 04:09:33.345199", + "step": 2682, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:09:33.447778", + "step": 2682, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0245287474244833, + "timestamp": "2025-09-04 04:09:33.467127", + "step": 2683, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:09:33.575344", + "step": 2683, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04559353366494179, + "timestamp": "2025-09-04 04:09:33.596103", + "step": 2684, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:09:33.689138", + "step": 2684, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.08233796805143356, + "timestamp": "2025-09-04 04:09:33.708433", + "step": 2685, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:09:33.795837", + "step": 2685, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0036527463234961033, + "timestamp": "2025-09-04 04:09:33.811533", + "step": 2686, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:09:33.915424", + "step": 2686, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016142617911100388, + "timestamp": "2025-09-04 04:09:33.934707", + "step": 2687, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:09:34.014124", + "step": 2687, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.029892858117818832, + "timestamp": "2025-09-04 04:09:34.029190", + "step": 2688, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 04:09:34.099577", + "step": 2688, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009370243176817894, + "timestamp": "2025-09-04 04:09:34.113742", + "step": 2689, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:09:34.192195", + "step": 2689, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.017486093565821648, + "timestamp": "2025-09-04 04:09:34.206344", + "step": 2690, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:09:34.309269", + "step": 2690, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02338462322950363, + "timestamp": "2025-09-04 04:09:34.328488", + "step": 2691, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:09:34.417191", + "step": 2691, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.048346079885959625, + "timestamp": "2025-09-04 04:09:34.433483", + "step": 2692, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:09:34.524648", + "step": 2692, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.017204314470291138, + "timestamp": "2025-09-04 04:09:34.543518", + "step": 2693, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 04:09:34.666403", + "step": 2693, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00045050657354295254, + "timestamp": "2025-09-04 04:09:34.689538", + "step": 2694, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:09:34.774209", + "step": 2694, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01556316763162613, + "timestamp": "2025-09-04 04:09:34.789446", + "step": 2695, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:09:34.888274", + "step": 2695, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04271606728434563, + "timestamp": "2025-09-04 04:09:34.907744", + "step": 2696, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:09:35.006125", + "step": 2696, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008685296401381493, + "timestamp": "2025-09-04 04:09:35.026917", + "step": 2697, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:09:35.133281", + "step": 2697, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0019303993321955204, + "timestamp": "2025-09-04 04:09:35.153384", + "step": 2698, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:09:35.252519", + "step": 2698, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01800696924328804, + "timestamp": "2025-09-04 04:09:35.271186", + "step": 2699, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:09:35.365532", + "step": 2699, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01136024296283722, + "timestamp": "2025-09-04 04:09:35.383767", + "step": 2700, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:09:43.928592", + "step": 2700, + "epoch": 3 + }, + { + "type": "pplx", + "content": 313.122110339322, + "timestamp": "2025-09-04 04:09:43.930940", + "step": 2700, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 960 + ], + "flops": 19200116623104.0 + }, + "timestamp": "2025-09-04 04:09:44.064080", + "step": 2700, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011454450897872448, + "timestamp": "2025-09-04 04:09:44.092892", + "step": 2701, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:09:44.185228", + "step": 2701, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.031085196882486343, + "timestamp": "2025-09-04 04:09:44.201909", + "step": 2702, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:09:44.304243", + "step": 2702, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006320777349174023, + "timestamp": "2025-09-04 04:09:44.322922", + "step": 2703, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:09:44.427439", + "step": 2703, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012303713709115982, + "timestamp": "2025-09-04 04:09:44.447172", + "step": 2704, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:09:44.531985", + "step": 2704, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004898655693978071, + "timestamp": "2025-09-04 04:09:44.548628", + "step": 2705, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:09:44.662833", + "step": 2705, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01601606048643589, + "timestamp": "2025-09-04 04:09:44.681951", + "step": 2706, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:09:44.767300", + "step": 2706, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011142881587147713, + "timestamp": "2025-09-04 04:09:44.782310", + "step": 2707, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:09:44.901063", + "step": 2707, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01005838718265295, + "timestamp": "2025-09-04 04:09:44.923827", + "step": 2708, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:09:45.024411", + "step": 2708, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.05936342850327492, + "timestamp": "2025-09-04 04:09:45.044274", + "step": 2709, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:09:45.156582", + "step": 2709, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008189283311367035, + "timestamp": "2025-09-04 04:09:45.173064", + "step": 2710, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:09:45.285504", + "step": 2710, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.10139759629964828, + "timestamp": "2025-09-04 04:09:45.305622", + "step": 2711, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:09:45.407014", + "step": 2711, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.023838041350245476, + "timestamp": "2025-09-04 04:09:45.426287", + "step": 2712, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:09:45.519681", + "step": 2712, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02069295570254326, + "timestamp": "2025-09-04 04:09:45.538280", + "step": 2713, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:09:45.616632", + "step": 2713, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.07725197076797485, + "timestamp": "2025-09-04 04:09:45.630199", + "step": 2714, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:09:45.725539", + "step": 2714, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006542861927300692, + "timestamp": "2025-09-04 04:09:45.742578", + "step": 2715, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:09:45.847195", + "step": 2715, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03040933422744274, + "timestamp": "2025-09-04 04:09:45.867057", + "step": 2716, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:09:45.961569", + "step": 2716, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0011650609085336328, + "timestamp": "2025-09-04 04:09:45.980564", + "step": 2717, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:09:46.091204", + "step": 2717, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.020016556605696678, + "timestamp": "2025-09-04 04:09:46.111056", + "step": 2718, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:09:46.213969", + "step": 2718, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0162852443754673, + "timestamp": "2025-09-04 04:09:46.231413", + "step": 2719, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:09:46.352029", + "step": 2719, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0035490740556269884, + "timestamp": "2025-09-04 04:09:46.374802", + "step": 2720, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:09:54.902522", + "step": 2720, + "epoch": 3 + }, + { + "type": "pplx", + "content": 314.5356204228338, + "timestamp": "2025-09-04 04:09:54.905530", + "step": 2720, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2720", + "timestamp": "2025-09-04 04:09:55.409233", + "step": 2720, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1248 + ], + "flops": 24960151589760.0 + }, + "timestamp": "2025-09-04 04:09:55.588923", + "step": 2720, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0011699828319251537, + "timestamp": "2025-09-04 04:09:55.626999", + "step": 2721, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:09:55.710243", + "step": 2721, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008469424210488796, + "timestamp": "2025-09-04 04:09:55.725330", + "step": 2722, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:09:55.843055", + "step": 2722, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005703017581254244, + "timestamp": "2025-09-04 04:09:55.865254", + "step": 2723, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:09:55.940919", + "step": 2723, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.06145107373595238, + "timestamp": "2025-09-04 04:09:55.955263", + "step": 2724, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:09:56.031260", + "step": 2724, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.021033862605690956, + "timestamp": "2025-09-04 04:09:56.046734", + "step": 2725, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:09:56.138352", + "step": 2725, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009126712568104267, + "timestamp": "2025-09-04 04:09:56.155147", + "step": 2726, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:09:56.272076", + "step": 2726, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009277377277612686, + "timestamp": "2025-09-04 04:09:56.292646", + "step": 2727, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:09:56.386013", + "step": 2727, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0015412485226988792, + "timestamp": "2025-09-04 04:09:56.404221", + "step": 2728, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:09:56.499794", + "step": 2728, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02090812474489212, + "timestamp": "2025-09-04 04:09:56.520175", + "step": 2729, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:09:56.605490", + "step": 2729, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010399392805993557, + "timestamp": "2025-09-04 04:09:56.620588", + "step": 2730, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:09:56.730598", + "step": 2730, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0056745195761322975, + "timestamp": "2025-09-04 04:09:56.750912", + "step": 2731, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:09:56.844525", + "step": 2731, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.13878655433654785, + "timestamp": "2025-09-04 04:09:56.862397", + "step": 2732, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:09:56.938712", + "step": 2732, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016388939693570137, + "timestamp": "2025-09-04 04:09:56.954158", + "step": 2733, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:09:57.053909", + "step": 2733, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011486927047371864, + "timestamp": "2025-09-04 04:09:57.072709", + "step": 2734, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:09:57.189227", + "step": 2734, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007860065437853336, + "timestamp": "2025-09-04 04:09:57.211366", + "step": 2735, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:09:57.322583", + "step": 2735, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013591518625617027, + "timestamp": "2025-09-04 04:09:57.343880", + "step": 2736, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:09:57.445097", + "step": 2736, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002234839601442218, + "timestamp": "2025-09-04 04:09:57.466151", + "step": 2737, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:09:57.568464", + "step": 2737, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.05005452781915665, + "timestamp": "2025-09-04 04:09:57.585998", + "step": 2738, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:09:57.689780", + "step": 2738, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02009434998035431, + "timestamp": "2025-09-04 04:09:57.708905", + "step": 2739, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:09:57.804873", + "step": 2739, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.021923096850514412, + "timestamp": "2025-09-04 04:09:57.823090", + "step": 2740, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:10:06.199426", + "step": 2740, + "epoch": 3 + }, + { + "type": "pplx", + "content": 311.12367018379337, + "timestamp": "2025-09-04 04:10:06.201852", + "step": 2740, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:10:06.299389", + "step": 2740, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.017065318301320076, + "timestamp": "2025-09-04 04:10:06.320394", + "step": 2741, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:10:06.429479", + "step": 2741, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02739621140062809, + "timestamp": "2025-09-04 04:10:06.449742", + "step": 2742, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:10:06.557790", + "step": 2742, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0004024782683700323, + "timestamp": "2025-09-04 04:10:06.578034", + "step": 2743, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:10:06.684752", + "step": 2743, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03272821381688118, + "timestamp": "2025-09-04 04:10:06.705438", + "step": 2744, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 896 + ], + "flops": 17920108852736.0 + }, + "timestamp": "2025-09-04 04:10:06.833286", + "step": 2744, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.036441512405872345, + "timestamp": "2025-09-04 04:10:06.860279", + "step": 2745, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:10:06.961146", + "step": 2745, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.017346149310469627, + "timestamp": "2025-09-04 04:10:06.979983", + "step": 2746, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:10:07.086318", + "step": 2746, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008722832426428795, + "timestamp": "2025-09-04 04:10:07.106265", + "step": 2747, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:10:07.216471", + "step": 2747, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004638466984033585, + "timestamp": "2025-09-04 04:10:07.237228", + "step": 2748, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:10:07.320538", + "step": 2748, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0057872445322573185, + "timestamp": "2025-09-04 04:10:07.337391", + "step": 2749, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:10:07.448267", + "step": 2749, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0026939285453408957, + "timestamp": "2025-09-04 04:10:07.468552", + "step": 2750, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:10:07.570874", + "step": 2750, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0036715560127049685, + "timestamp": "2025-09-04 04:10:07.589996", + "step": 2751, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:10:07.671514", + "step": 2751, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0037178792990744114, + "timestamp": "2025-09-04 04:10:07.686499", + "step": 2752, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:10:07.777656", + "step": 2752, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0027517317794263363, + "timestamp": "2025-09-04 04:10:07.796789", + "step": 2753, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:10:07.897199", + "step": 2753, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0021229060366749763, + "timestamp": "2025-09-04 04:10:07.916168", + "step": 2754, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:10:07.994227", + "step": 2754, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006144124083220959, + "timestamp": "2025-09-04 04:10:08.008433", + "step": 2755, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:10:08.109566", + "step": 2755, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0041644214652478695, + "timestamp": "2025-09-04 04:10:08.129094", + "step": 2756, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:10:08.239640", + "step": 2756, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.018796509131789207, + "timestamp": "2025-09-04 04:10:08.262060", + "step": 2757, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:10:08.379321", + "step": 2757, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009927745908498764, + "timestamp": "2025-09-04 04:10:08.401456", + "step": 2758, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:10:08.511886", + "step": 2758, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0052697621285915375, + "timestamp": "2025-09-04 04:10:08.532136", + "step": 2759, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:10:08.642566", + "step": 2759, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005806296598166227, + "timestamp": "2025-09-04 04:10:08.663642", + "step": 2760, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:10:17.026885", + "step": 2760, + "epoch": 3 + }, + { + "type": "pplx", + "content": 312.6903928437137, + "timestamp": "2025-09-04 04:10:17.028993", + "step": 2760, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2760", + "timestamp": "2025-09-04 04:10:17.510313", + "step": 2760, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:10:17.606536", + "step": 2760, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03818826749920845, + "timestamp": "2025-09-04 04:10:17.626870", + "step": 2761, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:10:17.708673", + "step": 2761, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004727974068373442, + "timestamp": "2025-09-04 04:10:17.723698", + "step": 2762, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:10:17.824531", + "step": 2762, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001716333907097578, + "timestamp": "2025-09-04 04:10:17.843370", + "step": 2763, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:10:17.929149", + "step": 2763, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01537768542766571, + "timestamp": "2025-09-04 04:10:17.945052", + "step": 2764, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:10:18.039295", + "step": 2764, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003431823570281267, + "timestamp": "2025-09-04 04:10:18.058557", + "step": 2765, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:10:18.152151", + "step": 2765, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01357530988752842, + "timestamp": "2025-09-04 04:10:18.168950", + "step": 2766, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:10:18.263993", + "step": 2766, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009554415941238403, + "timestamp": "2025-09-04 04:10:18.281261", + "step": 2767, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:10:18.386404", + "step": 2767, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04511303827166557, + "timestamp": "2025-09-04 04:10:18.406572", + "step": 2768, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:10:18.510539", + "step": 2768, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00565410265699029, + "timestamp": "2025-09-04 04:10:18.532370", + "step": 2769, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:10:18.627990", + "step": 2769, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01116864662617445, + "timestamp": "2025-09-04 04:10:18.645617", + "step": 2770, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 04:10:18.717805", + "step": 2770, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.024722347036004066, + "timestamp": "2025-09-04 04:10:18.730823", + "step": 2771, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:10:18.831877", + "step": 2771, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.028597375378012657, + "timestamp": "2025-09-04 04:10:18.851318", + "step": 2772, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:10:18.929289", + "step": 2772, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0017174814129248261, + "timestamp": "2025-09-04 04:10:18.944338", + "step": 2773, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:10:19.054046", + "step": 2773, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.018303746357560158, + "timestamp": "2025-09-04 04:10:19.074366", + "step": 2774, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:10:19.182654", + "step": 2774, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005763411987572908, + "timestamp": "2025-09-04 04:10:19.202145", + "step": 2775, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:10:19.290334", + "step": 2775, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.08446403592824936, + "timestamp": "2025-09-04 04:10:19.306205", + "step": 2776, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:10:19.405065", + "step": 2776, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02213035710155964, + "timestamp": "2025-09-04 04:10:19.425185", + "step": 2777, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:10:19.528569", + "step": 2777, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00788130797445774, + "timestamp": "2025-09-04 04:10:19.547055", + "step": 2778, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:10:19.630529", + "step": 2778, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002031184732913971, + "timestamp": "2025-09-04 04:10:19.645671", + "step": 2779, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:10:19.755409", + "step": 2779, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0032860611099749804, + "timestamp": "2025-09-04 04:10:19.776604", + "step": 2780, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:10:28.279651", + "step": 2780, + "epoch": 3 + }, + { + "type": "pplx", + "content": 315.0957572527015, + "timestamp": "2025-09-04 04:10:28.282924", + "step": 2780, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:10:28.358024", + "step": 2780, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.020035451278090477, + "timestamp": "2025-09-04 04:10:28.373018", + "step": 2781, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:10:28.481511", + "step": 2781, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.019282890483736992, + "timestamp": "2025-09-04 04:10:28.501835", + "step": 2782, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:10:28.605645", + "step": 2782, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0053369090892374516, + "timestamp": "2025-09-04 04:10:28.624855", + "step": 2783, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:10:28.702196", + "step": 2783, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003618256188929081, + "timestamp": "2025-09-04 04:10:28.717006", + "step": 2784, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 800 + ], + "flops": 16000097197184.0 + }, + "timestamp": "2025-09-04 04:10:28.833789", + "step": 2784, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03532740846276283, + "timestamp": "2025-09-04 04:10:28.857626", + "step": 2785, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:10:28.957073", + "step": 2785, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014763697050511837, + "timestamp": "2025-09-04 04:10:28.975683", + "step": 2786, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:10:29.082579", + "step": 2786, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.05285456404089928, + "timestamp": "2025-09-04 04:10:29.102524", + "step": 2787, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:10:29.209574", + "step": 2787, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01331684272736311, + "timestamp": "2025-09-04 04:10:29.230075", + "step": 2788, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:10:29.306219", + "step": 2788, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03862687200307846, + "timestamp": "2025-09-04 04:10:29.321566", + "step": 2789, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:10:29.405360", + "step": 2789, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0027003700379282236, + "timestamp": "2025-09-04 04:10:29.420505", + "step": 2790, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:10:29.513961", + "step": 2790, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.024312736466526985, + "timestamp": "2025-09-04 04:10:29.531382", + "step": 2791, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:10:29.641310", + "step": 2791, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009408136829733849, + "timestamp": "2025-09-04 04:10:29.662400", + "step": 2792, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:10:29.770535", + "step": 2792, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008958464488387108, + "timestamp": "2025-09-04 04:10:29.793068", + "step": 2793, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:10:29.900709", + "step": 2793, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0011589645873755217, + "timestamp": "2025-09-04 04:10:29.920749", + "step": 2794, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:10:30.016347", + "step": 2794, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00341598829254508, + "timestamp": "2025-09-04 04:10:30.033834", + "step": 2795, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:10:30.111178", + "step": 2795, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012302583083510399, + "timestamp": "2025-09-04 04:10:30.125938", + "step": 2796, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:10:30.224337", + "step": 2796, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.017612578347325325, + "timestamp": "2025-09-04 04:10:30.245067", + "step": 2797, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:10:30.339629", + "step": 2797, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007235830184072256, + "timestamp": "2025-09-04 04:10:30.357101", + "step": 2798, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:10:30.450764", + "step": 2798, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0061172679997980595, + "timestamp": "2025-09-04 04:10:30.468141", + "step": 2799, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:10:30.579493", + "step": 2799, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001860892865806818, + "timestamp": "2025-09-04 04:10:30.600698", + "step": 2800, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:10:39.011398", + "step": 2800, + "epoch": 3 + }, + { + "type": "pplx", + "content": 318.34111266257355, + "timestamp": "2025-09-04 04:10:39.013794", + "step": 2800, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2800", + "timestamp": "2025-09-04 04:10:39.393827", + "step": 2800, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:10:39.474968", + "step": 2800, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.10843028128147125, + "timestamp": "2025-09-04 04:10:39.491863", + "step": 2801, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:10:39.595028", + "step": 2801, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002566551323980093, + "timestamp": "2025-09-04 04:10:39.614327", + "step": 2802, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:10:39.708227", + "step": 2802, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010621008463203907, + "timestamp": "2025-09-04 04:10:39.725450", + "step": 2803, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:10:39.803350", + "step": 2803, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.015502018854022026, + "timestamp": "2025-09-04 04:10:39.818407", + "step": 2804, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:10:39.918618", + "step": 2804, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00420707743614912, + "timestamp": "2025-09-04 04:10:39.939782", + "step": 2805, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:10:40.032227", + "step": 2805, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02263396605849266, + "timestamp": "2025-09-04 04:10:40.046253", + "step": 2806, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:10:40.124234", + "step": 2806, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0052959551103413105, + "timestamp": "2025-09-04 04:10:40.138380", + "step": 2807, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:10:40.242077", + "step": 2807, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.015647169202566147, + "timestamp": "2025-09-04 04:10:40.262075", + "step": 2808, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:10:40.369856", + "step": 2808, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0023905981797724962, + "timestamp": "2025-09-04 04:10:40.392110", + "step": 2809, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:10:40.500448", + "step": 2809, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002249652286991477, + "timestamp": "2025-09-04 04:10:40.520444", + "step": 2810, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:10:40.624725", + "step": 2810, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006803617812693119, + "timestamp": "2025-09-04 04:10:40.644151", + "step": 2811, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:10:40.748833", + "step": 2811, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0021052875090390444, + "timestamp": "2025-09-04 04:10:40.768933", + "step": 2812, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:10:40.869181", + "step": 2812, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004011242184787989, + "timestamp": "2025-09-04 04:10:40.890221", + "step": 2813, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1408 + ], + "flops": 28160171015680.0 + }, + "timestamp": "2025-09-04 04:10:41.095710", + "step": 2813, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0032255747355520725, + "timestamp": "2025-09-04 04:10:41.134881", + "step": 2814, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:10:41.212690", + "step": 2814, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0025714444927871227, + "timestamp": "2025-09-04 04:10:41.226621", + "step": 2815, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:10:41.327667", + "step": 2815, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013107407838106155, + "timestamp": "2025-09-04 04:10:41.347348", + "step": 2816, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:10:41.444093", + "step": 2816, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0025292744394391775, + "timestamp": "2025-09-04 04:10:41.464580", + "step": 2817, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:10:41.566996", + "step": 2817, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.019604351371526718, + "timestamp": "2025-09-04 04:10:41.586193", + "step": 2818, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:10:41.685731", + "step": 2818, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006207880564033985, + "timestamp": "2025-09-04 04:10:41.704494", + "step": 2819, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:10:41.804185", + "step": 2819, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0022927229292690754, + "timestamp": "2025-09-04 04:10:41.823554", + "step": 2820, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:10:50.340911", + "step": 2820, + "epoch": 3 + }, + { + "type": "pplx", + "content": 319.52629439814444, + "timestamp": "2025-09-04 04:10:50.346919", + "step": 2820, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:10:50.421114", + "step": 2820, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0014415581244975328, + "timestamp": "2025-09-04 04:10:50.436450", + "step": 2821, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:10:50.513386", + "step": 2821, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.039860036224126816, + "timestamp": "2025-09-04 04:10:50.527474", + "step": 2822, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:10:50.638203", + "step": 2822, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04684140905737877, + "timestamp": "2025-09-04 04:10:50.658808", + "step": 2823, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:10:50.751914", + "step": 2823, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005943602416664362, + "timestamp": "2025-09-04 04:10:50.769856", + "step": 2824, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:10:50.868293", + "step": 2824, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006049733608961105, + "timestamp": "2025-09-04 04:10:50.888798", + "step": 2825, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:10:50.988731", + "step": 2825, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0014789201086387038, + "timestamp": "2025-09-04 04:10:51.007670", + "step": 2826, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1488 + ], + "flops": 29760180728640.0 + }, + "timestamp": "2025-09-04 04:10:51.233431", + "step": 2826, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.029814256355166435, + "timestamp": "2025-09-04 04:10:51.275647", + "step": 2827, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:10:51.380010", + "step": 2827, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004776179324835539, + "timestamp": "2025-09-04 04:10:51.400069", + "step": 2828, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:10:51.476052", + "step": 2828, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.025096198543906212, + "timestamp": "2025-09-04 04:10:51.491170", + "step": 2829, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1200 + ], + "flops": 24000145761984.0 + }, + "timestamp": "2025-09-04 04:10:51.666720", + "step": 2829, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0008539275149814785, + "timestamp": "2025-09-04 04:10:51.699819", + "step": 2830, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:10:51.794605", + "step": 2830, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.039075467735528946, + "timestamp": "2025-09-04 04:10:51.812176", + "step": 2831, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:10:51.927654", + "step": 2831, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002711005974560976, + "timestamp": "2025-09-04 04:10:51.948514", + "step": 2832, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:10:52.054122", + "step": 2832, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01790589839220047, + "timestamp": "2025-09-04 04:10:52.076419", + "step": 2833, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:10:52.177333", + "step": 2833, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.06275072693824768, + "timestamp": "2025-09-04 04:10:52.196207", + "step": 2834, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:10:52.300946", + "step": 2834, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004898981656879187, + "timestamp": "2025-09-04 04:10:52.320225", + "step": 2835, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1376 + ], + "flops": 27520167130496.0 + }, + "timestamp": "2025-09-04 04:10:52.532531", + "step": 2835, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009175102226436138, + "timestamp": "2025-09-04 04:10:52.572401", + "step": 2836, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:10:52.665860", + "step": 2836, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009001928381621838, + "timestamp": "2025-09-04 04:10:52.685175", + "step": 2837, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:10:52.768630", + "step": 2837, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0046532354317605495, + "timestamp": "2025-09-04 04:10:52.783757", + "step": 2838, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:10:52.887118", + "step": 2838, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010329126380383968, + "timestamp": "2025-09-04 04:10:52.906475", + "step": 2839, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:10:53.005841", + "step": 2839, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010225264355540276, + "timestamp": "2025-09-04 04:10:53.025440", + "step": 2840, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:11:01.520893", + "step": 2840, + "epoch": 3 + }, + { + "type": "pplx", + "content": 325.51343438767867, + "timestamp": "2025-09-04 04:11:01.522859", + "step": 2840, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2840", + "timestamp": "2025-09-04 04:11:01.898939", + "step": 2840, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:11:02.001221", + "step": 2840, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01796550862491131, + "timestamp": "2025-09-04 04:11:02.022491", + "step": 2841, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:11:02.125380", + "step": 2841, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005674062762409449, + "timestamp": "2025-09-04 04:11:02.144648", + "step": 2842, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 04:11:02.266925", + "step": 2842, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01859263703227043, + "timestamp": "2025-09-04 04:11:02.290243", + "step": 2843, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:11:02.397530", + "step": 2843, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0015665870159864426, + "timestamp": "2025-09-04 04:11:02.418251", + "step": 2844, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:11:02.527583", + "step": 2844, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012087870389223099, + "timestamp": "2025-09-04 04:11:02.550254", + "step": 2845, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:11:02.659541", + "step": 2845, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.020001424476504326, + "timestamp": "2025-09-04 04:11:02.679920", + "step": 2846, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:11:02.756649", + "step": 2846, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.028880124911665916, + "timestamp": "2025-09-04 04:11:02.770460", + "step": 2847, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 960 + ], + "flops": 19200116623104.0 + }, + "timestamp": "2025-09-04 04:11:02.907840", + "step": 2847, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.05806123465299606, + "timestamp": "2025-09-04 04:11:02.934843", + "step": 2848, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:11:03.012846", + "step": 2848, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.024739542976021767, + "timestamp": "2025-09-04 04:11:03.028248", + "step": 2849, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:11:03.132752", + "step": 2849, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01195996068418026, + "timestamp": "2025-09-04 04:11:03.151984", + "step": 2850, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:11:03.252141", + "step": 2850, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002954959636554122, + "timestamp": "2025-09-04 04:11:03.271134", + "step": 2851, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:11:03.381295", + "step": 2851, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03290526196360588, + "timestamp": "2025-09-04 04:11:03.402381", + "step": 2852, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:11:03.503914", + "step": 2852, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0018533534603193402, + "timestamp": "2025-09-04 04:11:03.525056", + "step": 2853, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:11:03.629515", + "step": 2853, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006066088564693928, + "timestamp": "2025-09-04 04:11:03.648528", + "step": 2854, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1168 + ], + "flops": 23360141876800.0 + }, + "timestamp": "2025-09-04 04:11:03.822330", + "step": 2854, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006023372989147902, + "timestamp": "2025-09-04 04:11:03.855018", + "step": 2855, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:11:03.941618", + "step": 2855, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012206432409584522, + "timestamp": "2025-09-04 04:11:03.957991", + "step": 2856, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:11:04.049387", + "step": 2856, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011448818258941174, + "timestamp": "2025-09-04 04:11:04.068480", + "step": 2857, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:11:04.161673", + "step": 2857, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008118141442537308, + "timestamp": "2025-09-04 04:11:04.178779", + "step": 2858, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:11:04.271839", + "step": 2858, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008240415714681149, + "timestamp": "2025-09-04 04:11:04.288945", + "step": 2859, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:11:04.389798", + "step": 2859, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0023680399172008038, + "timestamp": "2025-09-04 04:11:04.409541", + "step": 2860, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:11:12.908013", + "step": 2860, + "epoch": 3 + }, + { + "type": "pplx", + "content": 331.32267155954526, + "timestamp": "2025-09-04 04:11:12.910564", + "step": 2860, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:11:13.014712", + "step": 2860, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0046889204531908035, + "timestamp": "2025-09-04 04:11:13.037029", + "step": 2861, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:11:13.136629", + "step": 2861, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016678936779499054, + "timestamp": "2025-09-04 04:11:13.155187", + "step": 2862, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:11:13.255837", + "step": 2862, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0009306335123255849, + "timestamp": "2025-09-04 04:11:13.274539", + "step": 2863, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:11:13.384828", + "step": 2863, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003980133216828108, + "timestamp": "2025-09-04 04:11:13.406117", + "step": 2864, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:11:13.501527", + "step": 2864, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0005760484491474926, + "timestamp": "2025-09-04 04:11:13.520434", + "step": 2865, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:11:13.631198", + "step": 2865, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0011027660220861435, + "timestamp": "2025-09-04 04:11:13.651783", + "step": 2866, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:11:13.735521", + "step": 2866, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009353280998766422, + "timestamp": "2025-09-04 04:11:13.750541", + "step": 2867, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:11:13.852996", + "step": 2867, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00644304882735014, + "timestamp": "2025-09-04 04:11:13.873074", + "step": 2868, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1376 + ], + "flops": 27520167130496.0 + }, + "timestamp": "2025-09-04 04:11:14.072568", + "step": 2868, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.026496384292840958, + "timestamp": "2025-09-04 04:11:14.115279", + "step": 2869, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:11:14.233382", + "step": 2869, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003224861342459917, + "timestamp": "2025-09-04 04:11:14.255497", + "step": 2870, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:11:14.351529", + "step": 2870, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011617367155849934, + "timestamp": "2025-09-04 04:11:14.368918", + "step": 2871, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:11:14.476955", + "step": 2871, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.018545642495155334, + "timestamp": "2025-09-04 04:11:14.498148", + "step": 2872, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:11:14.601865", + "step": 2872, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0030028163455426693, + "timestamp": "2025-09-04 04:11:14.623707", + "step": 2873, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 04:11:14.759811", + "step": 2873, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0005331755965016782, + "timestamp": "2025-09-04 04:11:14.785767", + "step": 2874, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:11:14.871281", + "step": 2874, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.021123895421624184, + "timestamp": "2025-09-04 04:11:14.886891", + "step": 2875, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:11:14.977528", + "step": 2875, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001793418894521892, + "timestamp": "2025-09-04 04:11:14.995043", + "step": 2876, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:11:15.091817", + "step": 2876, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005147572606801987, + "timestamp": "2025-09-04 04:11:15.112275", + "step": 2877, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:11:15.195551", + "step": 2877, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.042930182069540024, + "timestamp": "2025-09-04 04:11:15.210752", + "step": 2878, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:11:15.303265", + "step": 2878, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008315377868711948, + "timestamp": "2025-09-04 04:11:15.320401", + "step": 2879, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 04:11:15.391797", + "step": 2879, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0415649451315403, + "timestamp": "2025-09-04 04:11:15.405539", + "step": 2880, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:11:23.878254", + "step": 2880, + "epoch": 3 + }, + { + "type": "pplx", + "content": 334.28353166069746, + "timestamp": "2025-09-04 04:11:23.880247", + "step": 2880, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2880", + "timestamp": "2025-09-04 04:11:24.404237", + "step": 2880, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:11:24.478633", + "step": 2880, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.015389709733426571, + "timestamp": "2025-09-04 04:11:24.493617", + "step": 2881, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:11:24.570800", + "step": 2881, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012384703382849693, + "timestamp": "2025-09-04 04:11:24.584952", + "step": 2882, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:11:24.680140", + "step": 2882, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006399277481250465, + "timestamp": "2025-09-04 04:11:24.697690", + "step": 2883, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:11:24.791237", + "step": 2883, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.020068077370524406, + "timestamp": "2025-09-04 04:11:24.809296", + "step": 2884, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:11:24.906372", + "step": 2884, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006495574954897165, + "timestamp": "2025-09-04 04:11:24.926876", + "step": 2885, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:11:25.043941", + "step": 2885, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03137728571891785, + "timestamp": "2025-09-04 04:11:25.066254", + "step": 2886, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:11:25.159725", + "step": 2886, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01227316539734602, + "timestamp": "2025-09-04 04:11:25.177117", + "step": 2887, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 960 + ], + "flops": 19200116623104.0 + }, + "timestamp": "2025-09-04 04:11:25.315117", + "step": 2887, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0028199530206620693, + "timestamp": "2025-09-04 04:11:25.342043", + "step": 2888, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:11:25.432237", + "step": 2888, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009654730558395386, + "timestamp": "2025-09-04 04:11:25.451127", + "step": 2889, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:11:25.547980", + "step": 2889, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02699156291782856, + "timestamp": "2025-09-04 04:11:25.565553", + "step": 2890, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:11:25.666229", + "step": 2890, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014877088367938995, + "timestamp": "2025-09-04 04:11:25.684974", + "step": 2891, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:11:25.764195", + "step": 2891, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.041199635714292526, + "timestamp": "2025-09-04 04:11:25.779178", + "step": 2892, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:11:25.877779", + "step": 2892, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04918312653899193, + "timestamp": "2025-09-04 04:11:25.898518", + "step": 2893, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 432 + ], + "flops": 8640052517568.0 + }, + "timestamp": "2025-09-04 04:11:25.969548", + "step": 2893, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04640977457165718, + "timestamp": "2025-09-04 04:11:25.982360", + "step": 2894, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:11:26.091334", + "step": 2894, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011765974573791027, + "timestamp": "2025-09-04 04:11:26.111741", + "step": 2895, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:11:26.211303", + "step": 2895, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004320243373513222, + "timestamp": "2025-09-04 04:11:26.230665", + "step": 2896, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:11:26.313778", + "step": 2896, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003690729383379221, + "timestamp": "2025-09-04 04:11:26.330762", + "step": 2897, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:11:26.427045", + "step": 2897, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005680757109075785, + "timestamp": "2025-09-04 04:11:26.444663", + "step": 2898, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:11:26.545200", + "step": 2898, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0033540872391313314, + "timestamp": "2025-09-04 04:11:26.564186", + "step": 2899, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:11:26.679234", + "step": 2899, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008547638542950153, + "timestamp": "2025-09-04 04:11:26.700432", + "step": 2900, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:11:35.169803", + "step": 2900, + "epoch": 3 + }, + { + "type": "pplx", + "content": 331.12824914190674, + "timestamp": "2025-09-04 04:11:35.172146", + "step": 2900, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:11:35.267283", + "step": 2900, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0010944758541882038, + "timestamp": "2025-09-04 04:11:35.287673", + "step": 2901, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:11:35.363842", + "step": 2901, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016356654465198517, + "timestamp": "2025-09-04 04:11:35.377690", + "step": 2902, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:11:35.471661", + "step": 2902, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0011579522397369146, + "timestamp": "2025-09-04 04:11:35.489033", + "step": 2903, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:11:35.600157", + "step": 2903, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0319959782063961, + "timestamp": "2025-09-04 04:11:35.620991", + "step": 2904, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:11:35.721187", + "step": 2904, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.000950302230194211, + "timestamp": "2025-09-04 04:11:35.742430", + "step": 2905, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:11:35.848848", + "step": 2905, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001381176058202982, + "timestamp": "2025-09-04 04:11:35.868867", + "step": 2906, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:11:35.958637", + "step": 2906, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0023815217427909374, + "timestamp": "2025-09-04 04:11:35.975448", + "step": 2907, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:11:36.069679", + "step": 2907, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01757667027413845, + "timestamp": "2025-09-04 04:11:36.087672", + "step": 2908, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:11:36.188807", + "step": 2908, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0278801117092371, + "timestamp": "2025-09-04 04:11:36.210007", + "step": 2909, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:11:36.285728", + "step": 2909, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01353493519127369, + "timestamp": "2025-09-04 04:11:36.299222", + "step": 2910, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:11:36.401642", + "step": 2910, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04371942579746246, + "timestamp": "2025-09-04 04:11:36.420836", + "step": 2911, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:11:36.521093", + "step": 2911, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0061318958178162575, + "timestamp": "2025-09-04 04:11:36.540465", + "step": 2912, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:11:36.624627", + "step": 2912, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.057585082948207855, + "timestamp": "2025-09-04 04:11:36.641737", + "step": 2913, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:11:36.745445", + "step": 2913, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0021131334360688925, + "timestamp": "2025-09-04 04:11:36.763965", + "step": 2914, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:11:36.848566", + "step": 2914, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01308556366711855, + "timestamp": "2025-09-04 04:11:36.863954", + "step": 2915, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 816 + ], + "flops": 16320099139776.0 + }, + "timestamp": "2025-09-04 04:11:36.987369", + "step": 2915, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0034837471321225166, + "timestamp": "2025-09-04 04:11:37.011142", + "step": 2916, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:11:37.105198", + "step": 2916, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0005953749641776085, + "timestamp": "2025-09-04 04:11:37.124172", + "step": 2917, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:11:37.229787", + "step": 2917, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005551936570554972, + "timestamp": "2025-09-04 04:11:37.249736", + "step": 2918, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:11:37.356024", + "step": 2918, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008271034806966782, + "timestamp": "2025-09-04 04:11:37.376070", + "step": 2919, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 04:11:37.510820", + "step": 2919, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01085783913731575, + "timestamp": "2025-09-04 04:11:37.537414", + "step": 2920, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:11:46.026493", + "step": 2920, + "epoch": 3 + }, + { + "type": "pplx", + "content": 321.86002245296936, + "timestamp": "2025-09-04 04:11:46.028695", + "step": 2920, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2920", + "timestamp": "2025-09-04 04:11:46.382176", + "step": 2920, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 04:11:46.500520", + "step": 2920, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0059812976978719234, + "timestamp": "2025-09-04 04:11:46.525777", + "step": 2921, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:11:46.631419", + "step": 2921, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0009687381098046899, + "timestamp": "2025-09-04 04:11:46.648895", + "step": 2922, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 04:11:46.784826", + "step": 2922, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005854323972016573, + "timestamp": "2025-09-04 04:11:46.810927", + "step": 2923, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:11:46.910290", + "step": 2923, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.023817606270313263, + "timestamp": "2025-09-04 04:11:46.929762", + "step": 2924, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:11:47.036718", + "step": 2924, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006292128004133701, + "timestamp": "2025-09-04 04:11:47.059290", + "step": 2925, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:11:47.161720", + "step": 2925, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001953437924385071, + "timestamp": "2025-09-04 04:11:47.181032", + "step": 2926, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:11:47.257446", + "step": 2926, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00025311694480478764, + "timestamp": "2025-09-04 04:11:47.271151", + "step": 2927, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:11:47.364263", + "step": 2927, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01116481889039278, + "timestamp": "2025-09-04 04:11:47.382181", + "step": 2928, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:11:47.474507", + "step": 2928, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016344843432307243, + "timestamp": "2025-09-04 04:11:47.493520", + "step": 2929, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:11:47.587785", + "step": 2929, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0564613938331604, + "timestamp": "2025-09-04 04:11:47.605044", + "step": 2930, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:11:47.707478", + "step": 2930, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006602128501981497, + "timestamp": "2025-09-04 04:11:47.726448", + "step": 2931, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:11:47.804329", + "step": 2931, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00032434234162792563, + "timestamp": "2025-09-04 04:11:47.819267", + "step": 2932, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:11:47.911829", + "step": 2932, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011728801764547825, + "timestamp": "2025-09-04 04:11:47.931177", + "step": 2933, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:11:48.008311", + "step": 2933, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03661501035094261, + "timestamp": "2025-09-04 04:11:48.022294", + "step": 2934, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:11:48.126565", + "step": 2934, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005506326910108328, + "timestamp": "2025-09-04 04:11:48.145965", + "step": 2935, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:11:48.248985", + "step": 2935, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007380081806331873, + "timestamp": "2025-09-04 04:11:48.268936", + "step": 2936, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:11:48.356618", + "step": 2936, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.023367907851934433, + "timestamp": "2025-09-04 04:11:48.375066", + "step": 2937, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:11:48.484782", + "step": 2937, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.041043538600206375, + "timestamp": "2025-09-04 04:11:48.505538", + "step": 2938, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:11:48.607041", + "step": 2938, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014860640279948711, + "timestamp": "2025-09-04 04:11:48.625778", + "step": 2939, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:11:48.712747", + "step": 2939, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.05614135414361954, + "timestamp": "2025-09-04 04:11:48.729282", + "step": 2940, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:11:57.219177", + "step": 2940, + "epoch": 3 + }, + { + "type": "pplx", + "content": 315.5265659498303, + "timestamp": "2025-09-04 04:11:57.221252", + "step": 2940, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:11:57.296365", + "step": 2940, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004779836628586054, + "timestamp": "2025-09-04 04:11:57.311775", + "step": 2941, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:11:57.386172", + "step": 2941, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.024101873859763145, + "timestamp": "2025-09-04 04:11:57.399796", + "step": 2942, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:11:57.506945", + "step": 2942, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011341488920152187, + "timestamp": "2025-09-04 04:11:57.527061", + "step": 2943, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:11:57.637526", + "step": 2943, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.022627878934144974, + "timestamp": "2025-09-04 04:11:57.658821", + "step": 2944, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:11:57.756279", + "step": 2944, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01583726704120636, + "timestamp": "2025-09-04 04:11:57.777101", + "step": 2945, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 04:11:57.912462", + "step": 2945, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011284503154456615, + "timestamp": "2025-09-04 04:11:57.938532", + "step": 2946, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:11:58.041921", + "step": 2946, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01882072165608406, + "timestamp": "2025-09-04 04:11:58.061301", + "step": 2947, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 04:11:58.145850", + "step": 2947, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01440991461277008, + "timestamp": "2025-09-04 04:11:58.159707", + "step": 2948, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:11:58.256671", + "step": 2948, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.026601549237966537, + "timestamp": "2025-09-04 04:11:58.277139", + "step": 2949, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:11:58.380822", + "step": 2949, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03433069586753845, + "timestamp": "2025-09-04 04:11:58.400182", + "step": 2950, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:11:58.476174", + "step": 2950, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0034103342331945896, + "timestamp": "2025-09-04 04:11:58.489905", + "step": 2951, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:11:58.572803", + "step": 2951, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005983210634440184, + "timestamp": "2025-09-04 04:11:58.588627", + "step": 2952, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:11:58.690559", + "step": 2952, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016223527491092682, + "timestamp": "2025-09-04 04:11:58.711211", + "step": 2953, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:11:58.798428", + "step": 2953, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005183096043765545, + "timestamp": "2025-09-04 04:11:58.814156", + "step": 2954, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:11:58.900792", + "step": 2954, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.041471634060144424, + "timestamp": "2025-09-04 04:11:58.916498", + "step": 2955, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:11:59.026318", + "step": 2955, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0007432362181134522, + "timestamp": "2025-09-04 04:11:59.046046", + "step": 2956, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 04:11:59.165833", + "step": 2956, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010432606562972069, + "timestamp": "2025-09-04 04:11:59.191418", + "step": 2957, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:11:59.270172", + "step": 2957, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.015087028034031391, + "timestamp": "2025-09-04 04:11:59.284449", + "step": 2958, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:11:59.392745", + "step": 2958, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0005746278329752386, + "timestamp": "2025-09-04 04:11:59.412683", + "step": 2959, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 848 + ], + "flops": 16960103024960.0 + }, + "timestamp": "2025-09-04 04:11:59.546824", + "step": 2959, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005405406001955271, + "timestamp": "2025-09-04 04:11:59.571566", + "step": 2960, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:12:08.056684", + "step": 2960, + "epoch": 3 + }, + { + "type": "pplx", + "content": 312.525467653901, + "timestamp": "2025-09-04 04:12:08.058678", + "step": 2960, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 2960", + "timestamp": "2025-09-04 04:12:08.564968", + "step": 2960, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:12:08.639868", + "step": 2960, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011480286717414856, + "timestamp": "2025-09-04 04:12:08.654576", + "step": 2961, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:12:08.756540", + "step": 2961, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01124604418873787, + "timestamp": "2025-09-04 04:12:08.775864", + "step": 2962, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:12:08.876374", + "step": 2962, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005138123407959938, + "timestamp": "2025-09-04 04:12:08.895235", + "step": 2963, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:12:09.005049", + "step": 2963, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007659485097974539, + "timestamp": "2025-09-04 04:12:09.026215", + "step": 2964, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:12:09.134391", + "step": 2964, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02365208975970745, + "timestamp": "2025-09-04 04:12:09.157039", + "step": 2965, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:12:09.250194", + "step": 2965, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0039781988598406315, + "timestamp": "2025-09-04 04:12:09.267441", + "step": 2966, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:12:09.379255", + "step": 2966, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0009690428851172328, + "timestamp": "2025-09-04 04:12:09.399917", + "step": 2967, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:12:09.495805", + "step": 2967, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01487466599792242, + "timestamp": "2025-09-04 04:12:09.514085", + "step": 2968, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:12:09.614684", + "step": 2968, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.08440519124269485, + "timestamp": "2025-09-04 04:12:09.635750", + "step": 2969, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:12:09.734830", + "step": 2969, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.020220091566443443, + "timestamp": "2025-09-04 04:12:09.753528", + "step": 2970, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:12:09.852354", + "step": 2970, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010999851860105991, + "timestamp": "2025-09-04 04:12:09.871044", + "step": 2971, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:12:09.956995", + "step": 2971, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.022553278133273125, + "timestamp": "2025-09-04 04:12:09.973369", + "step": 2972, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 04:12:10.044334", + "step": 2972, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0029222038574516773, + "timestamp": "2025-09-04 04:12:10.058568", + "step": 2973, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:12:10.162220", + "step": 2973, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.015705617144703865, + "timestamp": "2025-09-04 04:12:10.181499", + "step": 2974, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 864 + ], + "flops": 17280104967552.0 + }, + "timestamp": "2025-09-04 04:12:10.308836", + "step": 2974, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012157633900642395, + "timestamp": "2025-09-04 04:12:10.333367", + "step": 2975, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1040 + ], + "flops": 20800126336064.0 + }, + "timestamp": "2025-09-04 04:12:10.486121", + "step": 2975, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016581635922193527, + "timestamp": "2025-09-04 04:12:10.516149", + "step": 2976, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:12:10.617030", + "step": 2976, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012151544913649559, + "timestamp": "2025-09-04 04:12:10.638199", + "step": 2977, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:12:10.717488", + "step": 2977, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00917992927134037, + "timestamp": "2025-09-04 04:12:10.731631", + "step": 2978, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:12:10.836079", + "step": 2978, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002120188670232892, + "timestamp": "2025-09-04 04:12:10.855226", + "step": 2979, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:12:10.948080", + "step": 2979, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006955720018595457, + "timestamp": "2025-09-04 04:12:10.965588", + "step": 2980, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:12:19.458344", + "step": 2980, + "epoch": 3 + }, + { + "type": "pplx", + "content": 315.1214843451217, + "timestamp": "2025-09-04 04:12:19.460879", + "step": 2980, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:12:19.548069", + "step": 2980, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003038703231140971, + "timestamp": "2025-09-04 04:12:19.566567", + "step": 2981, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:12:19.678102", + "step": 2981, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0017276513390243053, + "timestamp": "2025-09-04 04:12:19.698535", + "step": 2982, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:12:19.803796", + "step": 2982, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0019167335703969002, + "timestamp": "2025-09-04 04:12:19.823105", + "step": 2983, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:12:19.913285", + "step": 2983, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004277768079191446, + "timestamp": "2025-09-04 04:12:19.930819", + "step": 2984, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:12:20.021819", + "step": 2984, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011646011844277382, + "timestamp": "2025-09-04 04:12:20.040607", + "step": 2985, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:12:20.146162", + "step": 2985, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0010585073614493012, + "timestamp": "2025-09-04 04:12:20.166184", + "step": 2986, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:12:20.259593", + "step": 2986, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.024997970089316368, + "timestamp": "2025-09-04 04:12:20.276766", + "step": 2987, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:12:20.354448", + "step": 2987, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011168187484145164, + "timestamp": "2025-09-04 04:12:20.369389", + "step": 2988, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:12:20.470109", + "step": 2988, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004837450571358204, + "timestamp": "2025-09-04 04:12:20.491265", + "step": 2989, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:12:20.568690", + "step": 2989, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02616807632148266, + "timestamp": "2025-09-04 04:12:20.582758", + "step": 2990, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:12:20.660612", + "step": 2990, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006141870282590389, + "timestamp": "2025-09-04 04:12:20.674607", + "step": 2991, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:12:20.760305", + "step": 2991, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006691917777061462, + "timestamp": "2025-09-04 04:12:20.776741", + "step": 2992, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:12:20.873019", + "step": 2992, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005890341941267252, + "timestamp": "2025-09-04 04:12:20.893402", + "step": 2993, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:12:20.986981", + "step": 2993, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0004606809816323221, + "timestamp": "2025-09-04 04:12:21.004539", + "step": 2994, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:12:21.107947", + "step": 2994, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004719878546893597, + "timestamp": "2025-09-04 04:12:21.127136", + "step": 2995, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:12:21.237919", + "step": 2995, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0019325355533510447, + "timestamp": "2025-09-04 04:12:21.259351", + "step": 2996, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:12:21.355991", + "step": 2996, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0065997205674648285, + "timestamp": "2025-09-04 04:12:21.376493", + "step": 2997, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:12:21.466283", + "step": 2997, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02511041797697544, + "timestamp": "2025-09-04 04:12:21.483091", + "step": 2998, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:12:21.585402", + "step": 2998, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0015789009630680084, + "timestamp": "2025-09-04 04:12:21.604698", + "step": 2999, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:12:21.703899", + "step": 2999, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010090545751154423, + "timestamp": "2025-09-04 04:12:21.723385", + "step": 3000, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:12:30.115857", + "step": 3000, + "epoch": 3 + }, + { + "type": "pplx", + "content": 318.48539959590164, + "timestamp": "2025-09-04 04:12:30.117830", + "step": 3000, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3000", + "timestamp": "2025-09-04 04:12:30.470352", + "step": 3000, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:12:30.569127", + "step": 3000, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0071251485496759415, + "timestamp": "2025-09-04 04:12:30.589877", + "step": 3001, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:12:30.682094", + "step": 3001, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00022971679572947323, + "timestamp": "2025-09-04 04:12:30.699269", + "step": 3002, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:12:30.784778", + "step": 3002, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013213549740612507, + "timestamp": "2025-09-04 04:12:30.800298", + "step": 3003, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 04:12:30.872344", + "step": 3003, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002448985120281577, + "timestamp": "2025-09-04 04:12:30.886091", + "step": 3004, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:12:30.976093", + "step": 3004, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00946701131761074, + "timestamp": "2025-09-04 04:12:30.994794", + "step": 3005, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:12:31.079497", + "step": 3005, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.027095887809991837, + "timestamp": "2025-09-04 04:12:31.095018", + "step": 3006, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:12:31.188994", + "step": 3006, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011162908747792244, + "timestamp": "2025-09-04 04:12:31.206237", + "step": 3007, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:12:31.281961", + "step": 3007, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002971187699586153, + "timestamp": "2025-09-04 04:12:31.296302", + "step": 3008, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:12:31.401735", + "step": 3008, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0016095120226964355, + "timestamp": "2025-09-04 04:12:31.424309", + "step": 3009, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:12:31.531150", + "step": 3009, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0003574864531401545, + "timestamp": "2025-09-04 04:12:31.551257", + "step": 3010, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:12:31.644083", + "step": 3010, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.05357489734888077, + "timestamp": "2025-09-04 04:12:31.660973", + "step": 3011, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:12:31.762905", + "step": 3011, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.025260241702198982, + "timestamp": "2025-09-04 04:12:31.781102", + "step": 3012, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:12:31.895906", + "step": 3012, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002299356274306774, + "timestamp": "2025-09-04 04:12:31.920205", + "step": 3013, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:12:32.012619", + "step": 3013, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005653650965541601, + "timestamp": "2025-09-04 04:12:32.029774", + "step": 3014, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:12:32.138215", + "step": 3014, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004577314481139183, + "timestamp": "2025-09-04 04:12:32.157429", + "step": 3015, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:12:32.234613", + "step": 3015, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007822668179869652, + "timestamp": "2025-09-04 04:12:32.249408", + "step": 3016, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:12:32.349594", + "step": 3016, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0035554529167711735, + "timestamp": "2025-09-04 04:12:32.370303", + "step": 3017, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:12:32.446897", + "step": 3017, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.045765411108732224, + "timestamp": "2025-09-04 04:12:32.460693", + "step": 3018, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 04:12:32.597497", + "step": 3018, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0015078384894877672, + "timestamp": "2025-09-04 04:12:32.623398", + "step": 3019, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:12:32.724410", + "step": 3019, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0004389840178191662, + "timestamp": "2025-09-04 04:12:32.743531", + "step": 3020, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:12:41.245228", + "step": 3020, + "epoch": 3 + }, + { + "type": "pplx", + "content": 316.8819015781872, + "timestamp": "2025-09-04 04:12:41.247591", + "step": 3020, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:12:41.352031", + "step": 3020, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01975194923579693, + "timestamp": "2025-09-04 04:12:41.374506", + "step": 3021, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:12:41.485738", + "step": 3021, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002616028068587184, + "timestamp": "2025-09-04 04:12:41.506249", + "step": 3022, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 800 + ], + "flops": 16000097197184.0 + }, + "timestamp": "2025-09-04 04:12:41.626978", + "step": 3022, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014300533570349216, + "timestamp": "2025-09-04 04:12:41.648721", + "step": 3023, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:12:41.733490", + "step": 3023, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.020479435101151466, + "timestamp": "2025-09-04 04:12:41.749264", + "step": 3024, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:12:41.843852", + "step": 3024, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005946870427578688, + "timestamp": "2025-09-04 04:12:41.863014", + "step": 3025, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:12:41.966192", + "step": 3025, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.020112913101911545, + "timestamp": "2025-09-04 04:12:41.984766", + "step": 3026, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:12:42.080065", + "step": 3026, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0003505939384922385, + "timestamp": "2025-09-04 04:12:42.097153", + "step": 3027, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:12:42.212691", + "step": 3027, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.018574239686131477, + "timestamp": "2025-09-04 04:12:42.234095", + "step": 3028, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:12:42.336816", + "step": 3028, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006098731886595488, + "timestamp": "2025-09-04 04:12:42.357909", + "step": 3029, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:12:42.466999", + "step": 3029, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011256312020123005, + "timestamp": "2025-09-04 04:12:42.484487", + "step": 3030, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:12:42.581316", + "step": 3030, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02321520633995533, + "timestamp": "2025-09-04 04:12:42.598704", + "step": 3031, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:12:42.695635", + "step": 3031, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013380464166402817, + "timestamp": "2025-09-04 04:12:42.713836", + "step": 3032, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:12:42.827414", + "step": 3032, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.06969982385635376, + "timestamp": "2025-09-04 04:12:42.849634", + "step": 3033, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:12:42.960351", + "step": 3033, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001032329280860722, + "timestamp": "2025-09-04 04:12:42.980870", + "step": 3034, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:12:43.081469", + "step": 3034, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00046730105532333255, + "timestamp": "2025-09-04 04:12:43.099978", + "step": 3035, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:12:43.202230", + "step": 3035, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0037853161338716745, + "timestamp": "2025-09-04 04:12:43.221820", + "step": 3036, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:12:43.303940", + "step": 3036, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006179303163662553, + "timestamp": "2025-09-04 04:12:43.320561", + "step": 3037, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:12:43.411798", + "step": 3037, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.017078617587685585, + "timestamp": "2025-09-04 04:12:43.428547", + "step": 3038, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:12:43.526095", + "step": 3038, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0008244368946179748, + "timestamp": "2025-09-04 04:12:43.543550", + "step": 3039, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:12:43.648130", + "step": 3039, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0033969872165471315, + "timestamp": "2025-09-04 04:12:43.668121", + "step": 3040, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:12:52.082447", + "step": 3040, + "epoch": 3 + }, + { + "type": "pplx", + "content": 313.3448310983682, + "timestamp": "2025-09-04 04:12:52.084610", + "step": 3040, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3040", + "timestamp": "2025-09-04 04:12:52.615369", + "step": 3040, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:12:52.716905", + "step": 3040, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007473244331777096, + "timestamp": "2025-09-04 04:12:52.737524", + "step": 3041, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1488 + ], + "flops": 29760180728640.0 + }, + "timestamp": "2025-09-04 04:12:52.963136", + "step": 3041, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0020905376877635717, + "timestamp": "2025-09-04 04:12:53.005306", + "step": 3042, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:12:53.111301", + "step": 3042, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004675985313951969, + "timestamp": "2025-09-04 04:12:53.130386", + "step": 3043, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:12:53.226870", + "step": 3043, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004783936310559511, + "timestamp": "2025-09-04 04:12:53.244571", + "step": 3044, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:12:53.320321", + "step": 3044, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.024307608604431152, + "timestamp": "2025-09-04 04:12:53.334873", + "step": 3045, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:12:53.448940", + "step": 3045, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0014469543239101768, + "timestamp": "2025-09-04 04:12:53.469323", + "step": 3046, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:12:53.556850", + "step": 3046, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01129270438104868, + "timestamp": "2025-09-04 04:12:53.572473", + "step": 3047, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:12:53.663644", + "step": 3047, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009008025750517845, + "timestamp": "2025-09-04 04:12:53.681178", + "step": 3048, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:12:53.784218", + "step": 3048, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.05535700172185898, + "timestamp": "2025-09-04 04:12:53.806182", + "step": 3049, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:12:53.908028", + "step": 3049, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00018787295266520232, + "timestamp": "2025-09-04 04:12:53.927211", + "step": 3050, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:12:54.021968", + "step": 3050, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0013808540534228086, + "timestamp": "2025-09-04 04:12:54.039368", + "step": 3051, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:12:54.131919", + "step": 3051, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001885344390757382, + "timestamp": "2025-09-04 04:12:54.149844", + "step": 3052, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:12:54.250441", + "step": 3052, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007413564249873161, + "timestamp": "2025-09-04 04:12:54.271642", + "step": 3053, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:12:54.365785", + "step": 3053, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004210531245917082, + "timestamp": "2025-09-04 04:12:54.383238", + "step": 3054, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:12:54.492486", + "step": 3054, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004505162592977285, + "timestamp": "2025-09-04 04:12:54.513000", + "step": 3055, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:12:54.604034", + "step": 3055, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0008125255117192864, + "timestamp": "2025-09-04 04:12:54.621731", + "step": 3056, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:12:54.702929", + "step": 3056, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011885403655469418, + "timestamp": "2025-09-04 04:12:54.719635", + "step": 3057, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:12:54.814604", + "step": 3057, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004823492839932442, + "timestamp": "2025-09-04 04:12:54.832086", + "step": 3058, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:12:54.932251", + "step": 3058, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.023230794817209244, + "timestamp": "2025-09-04 04:12:54.951261", + "step": 3059, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:12:55.052619", + "step": 3059, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.06023210659623146, + "timestamp": "2025-09-04 04:12:55.071941", + "step": 3060, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:13:03.577234", + "step": 3060, + "epoch": 3 + }, + { + "type": "pplx", + "content": 310.1316931403451, + "timestamp": "2025-09-04 04:13:03.579410", + "step": 3060, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:13:03.660065", + "step": 3060, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005574073176831007, + "timestamp": "2025-09-04 04:13:03.676255", + "step": 3061, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:13:03.783932", + "step": 3061, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0004931015428155661, + "timestamp": "2025-09-04 04:13:03.803610", + "step": 3062, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:13:03.910704", + "step": 3062, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005763449240475893, + "timestamp": "2025-09-04 04:13:03.927601", + "step": 3063, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:13:04.032696", + "step": 3063, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.015147325582802296, + "timestamp": "2025-09-04 04:13:04.052545", + "step": 3064, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:13:04.151878", + "step": 3064, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013065881095826626, + "timestamp": "2025-09-04 04:13:04.172337", + "step": 3065, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:13:04.257765", + "step": 3065, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.06708889454603195, + "timestamp": "2025-09-04 04:13:04.272762", + "step": 3066, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1376 + ], + "flops": 27520167130496.0 + }, + "timestamp": "2025-09-04 04:13:04.478255", + "step": 3066, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0030464141163975, + "timestamp": "2025-09-04 04:13:04.517184", + "step": 3067, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:13:04.629622", + "step": 3067, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004817434120923281, + "timestamp": "2025-09-04 04:13:04.650836", + "step": 3068, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:13:04.727967", + "step": 3068, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010416961275041103, + "timestamp": "2025-09-04 04:13:04.742894", + "step": 3069, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:13:04.853693", + "step": 3069, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01584682986140251, + "timestamp": "2025-09-04 04:13:04.873990", + "step": 3070, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:13:04.953426", + "step": 3070, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0056066811084747314, + "timestamp": "2025-09-04 04:13:04.967223", + "step": 3071, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 04:13:05.041133", + "step": 3071, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0062073455192148685, + "timestamp": "2025-09-04 04:13:05.054608", + "step": 3072, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 04:13:05.176203", + "step": 3072, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007986130192875862, + "timestamp": "2025-09-04 04:13:05.201566", + "step": 3073, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:13:05.280097", + "step": 3073, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002340728882700205, + "timestamp": "2025-09-04 04:13:05.293930", + "step": 3074, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 912 + ], + "flops": 18240110795328.0 + }, + "timestamp": "2025-09-04 04:13:05.428660", + "step": 3074, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01518856268376112, + "timestamp": "2025-09-04 04:13:05.453088", + "step": 3075, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:13:05.553750", + "step": 3075, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011933338828384876, + "timestamp": "2025-09-04 04:13:05.573036", + "step": 3076, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:13:05.665519", + "step": 3076, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.019831262528896332, + "timestamp": "2025-09-04 04:13:05.684346", + "step": 3077, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:13:05.779207", + "step": 3077, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00772702693939209, + "timestamp": "2025-09-04 04:13:05.796138", + "step": 3078, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:13:05.886183", + "step": 3078, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03503193333745003, + "timestamp": "2025-09-04 04:13:05.901623", + "step": 3079, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:13:05.996518", + "step": 3079, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03195195645093918, + "timestamp": "2025-09-04 04:13:06.014248", + "step": 3080, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:13:14.451244", + "step": 3080, + "epoch": 3 + }, + { + "type": "pplx", + "content": 309.69911719013754, + "timestamp": "2025-09-04 04:13:14.453284", + "step": 3080, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3080", + "timestamp": "2025-09-04 04:13:14.996139", + "step": 3080, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:13:15.099957", + "step": 3080, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005432614590972662, + "timestamp": "2025-09-04 04:13:15.122154", + "step": 3081, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:13:15.215538", + "step": 3081, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.07827738672494888, + "timestamp": "2025-09-04 04:13:15.232831", + "step": 3082, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:13:15.335844", + "step": 3082, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0020832966547459364, + "timestamp": "2025-09-04 04:13:15.355072", + "step": 3083, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:13:15.464323", + "step": 3083, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003370664082467556, + "timestamp": "2025-09-04 04:13:15.485209", + "step": 3084, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:13:15.590624", + "step": 3084, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0391119010746479, + "timestamp": "2025-09-04 04:13:15.612896", + "step": 3085, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:13:15.719186", + "step": 3085, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0071410383097827435, + "timestamp": "2025-09-04 04:13:15.739191", + "step": 3086, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:13:15.846022", + "step": 3086, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.044355932623147964, + "timestamp": "2025-09-04 04:13:15.866073", + "step": 3087, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:13:15.971664", + "step": 3087, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02203553542494774, + "timestamp": "2025-09-04 04:13:15.992432", + "step": 3088, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:13:16.066807", + "step": 3088, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002954278141260147, + "timestamp": "2025-09-04 04:13:16.081846", + "step": 3089, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:13:16.163704", + "step": 3089, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0011478732340037823, + "timestamp": "2025-09-04 04:13:16.178843", + "step": 3090, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:13:16.274385", + "step": 3090, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03504842892289162, + "timestamp": "2025-09-04 04:13:16.291859", + "step": 3091, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:13:16.368898", + "step": 3091, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0484900027513504, + "timestamp": "2025-09-04 04:13:16.383661", + "step": 3092, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:13:16.472091", + "step": 3092, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02142166718840599, + "timestamp": "2025-09-04 04:13:16.490533", + "step": 3093, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:13:16.574016", + "step": 3093, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0017112598288804293, + "timestamp": "2025-09-04 04:13:16.589190", + "step": 3094, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:13:16.698367", + "step": 3094, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0018777156947180629, + "timestamp": "2025-09-04 04:13:16.718616", + "step": 3095, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:13:16.818290", + "step": 3095, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04635737091302872, + "timestamp": "2025-09-04 04:13:16.837679", + "step": 3096, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:13:16.918617", + "step": 3096, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.000942113867495209, + "timestamp": "2025-09-04 04:13:16.933954", + "step": 3097, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:13:17.010857", + "step": 3097, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0178525447845459, + "timestamp": "2025-09-04 04:13:17.024613", + "step": 3098, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:13:17.100145", + "step": 3098, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002621579449623823, + "timestamp": "2025-09-04 04:13:17.113931", + "step": 3099, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:13:17.197423", + "step": 3099, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003176899626851082, + "timestamp": "2025-09-04 04:13:17.213366", + "step": 3100, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:13:25.591497", + "step": 3100, + "epoch": 3 + }, + { + "type": "pplx", + "content": 309.6383679000778, + "timestamp": "2025-09-04 04:13:25.594304", + "step": 3100, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 800 + ], + "flops": 16000097197184.0 + }, + "timestamp": "2025-09-04 04:13:25.708607", + "step": 3100, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.018567724153399467, + "timestamp": "2025-09-04 04:13:25.732430", + "step": 3101, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:13:25.836648", + "step": 3101, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001145319314673543, + "timestamp": "2025-09-04 04:13:25.855894", + "step": 3102, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:13:25.953996", + "step": 3102, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0030605667270720005, + "timestamp": "2025-09-04 04:13:25.972593", + "step": 3103, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:13:26.083056", + "step": 3103, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003627515397965908, + "timestamp": "2025-09-04 04:13:26.104379", + "step": 3104, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:13:26.195117", + "step": 3104, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0005803365493193269, + "timestamp": "2025-09-04 04:13:26.214144", + "step": 3105, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:13:26.288628", + "step": 3105, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001379349734634161, + "timestamp": "2025-09-04 04:13:26.302105", + "step": 3106, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:13:26.411779", + "step": 3106, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01873205043375492, + "timestamp": "2025-09-04 04:13:26.432178", + "step": 3107, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:13:26.535336", + "step": 3107, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0028933845460414886, + "timestamp": "2025-09-04 04:13:26.555296", + "step": 3108, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:13:26.660819", + "step": 3108, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001489466754719615, + "timestamp": "2025-09-04 04:13:26.682708", + "step": 3109, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:13:26.757809", + "step": 3109, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002094303723424673, + "timestamp": "2025-09-04 04:13:26.771384", + "step": 3110, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:13:26.866276", + "step": 3110, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006700966041535139, + "timestamp": "2025-09-04 04:13:26.883690", + "step": 3111, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:13:26.983151", + "step": 3111, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01674632728099823, + "timestamp": "2025-09-04 04:13:27.002754", + "step": 3112, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:13:27.105954", + "step": 3112, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.08993439376354218, + "timestamp": "2025-09-04 04:13:27.127787", + "step": 3113, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:13:27.237145", + "step": 3113, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012733870185911655, + "timestamp": "2025-09-04 04:13:27.257788", + "step": 3114, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 04:13:27.330723", + "step": 3114, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.025248989462852478, + "timestamp": "2025-09-04 04:13:27.343584", + "step": 3115, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:13:27.437623", + "step": 3115, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0020341791678220034, + "timestamp": "2025-09-04 04:13:27.455792", + "step": 3116, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:13:27.547797", + "step": 3116, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0012790296459570527, + "timestamp": "2025-09-04 04:13:27.566869", + "step": 3117, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:13:27.670689", + "step": 3117, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003262518672272563, + "timestamp": "2025-09-04 04:13:27.689875", + "step": 3118, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:13:27.782359", + "step": 3118, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04332924634218216, + "timestamp": "2025-09-04 04:13:27.799486", + "step": 3119, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:13:27.894946", + "step": 3119, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0012113949051126838, + "timestamp": "2025-09-04 04:13:27.913216", + "step": 3120, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:13:36.378564", + "step": 3120, + "epoch": 3 + }, + { + "type": "pplx", + "content": 307.53064608444055, + "timestamp": "2025-09-04 04:13:36.380625", + "step": 3120, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3120", + "timestamp": "2025-09-04 04:13:36.827667", + "step": 3120, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:13:36.933924", + "step": 3120, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0004694383533205837, + "timestamp": "2025-09-04 04:13:36.956474", + "step": 3121, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:13:37.050096", + "step": 3121, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00680502038449049, + "timestamp": "2025-09-04 04:13:37.067253", + "step": 3122, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:13:37.166392", + "step": 3122, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009919785894453526, + "timestamp": "2025-09-04 04:13:37.184994", + "step": 3123, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:13:37.286944", + "step": 3123, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016770748421549797, + "timestamp": "2025-09-04 04:13:37.307060", + "step": 3124, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:13:37.398424", + "step": 3124, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005784905049949884, + "timestamp": "2025-09-04 04:13:37.417173", + "step": 3125, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:13:37.517434", + "step": 3125, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02264421060681343, + "timestamp": "2025-09-04 04:13:37.536320", + "step": 3126, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 384 + ], + "flops": 7680046689792.0 + }, + "timestamp": "2025-09-04 04:13:37.600702", + "step": 3126, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004687449894845486, + "timestamp": "2025-09-04 04:13:37.611963", + "step": 3127, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:13:37.728323", + "step": 3127, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.051042910665273666, + "timestamp": "2025-09-04 04:13:37.751279", + "step": 3128, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:13:37.840140", + "step": 3128, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0002769632264971733, + "timestamp": "2025-09-04 04:13:37.858257", + "step": 3129, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:13:37.936618", + "step": 3129, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011999246664345264, + "timestamp": "2025-09-04 04:13:37.950724", + "step": 3130, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:13:38.050492", + "step": 3130, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005194882862269878, + "timestamp": "2025-09-04 04:13:38.069413", + "step": 3131, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:13:38.169541", + "step": 3131, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.037000637501478195, + "timestamp": "2025-09-04 04:13:38.189321", + "step": 3132, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:13:38.273497", + "step": 3132, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010427704080939293, + "timestamp": "2025-09-04 04:13:38.290645", + "step": 3133, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:13:38.396605", + "step": 3133, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002125280909240246, + "timestamp": "2025-09-04 04:13:38.416728", + "step": 3134, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:13:38.512019", + "step": 3134, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0023927779402583838, + "timestamp": "2025-09-04 04:13:38.529572", + "step": 3135, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:13:38.628419", + "step": 3135, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00024334284535143524, + "timestamp": "2025-09-04 04:13:38.647962", + "step": 3136, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:13:38.738054", + "step": 3136, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0012799968244507909, + "timestamp": "2025-09-04 04:13:38.756943", + "step": 3137, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:13:38.859262", + "step": 3137, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010953391902148724, + "timestamp": "2025-09-04 04:13:38.878460", + "step": 3138, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:13:38.956198", + "step": 3138, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0005597640410996974, + "timestamp": "2025-09-04 04:13:38.970039", + "step": 3139, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:13:39.060351", + "step": 3139, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.017131542786955833, + "timestamp": "2025-09-04 04:13:39.077857", + "step": 3140, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:13:47.459705", + "step": 3140, + "epoch": 3 + }, + { + "type": "pplx", + "content": 303.543175064668, + "timestamp": "2025-09-04 04:13:47.461952", + "step": 3140, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:13:47.544243", + "step": 3140, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005871990229934454, + "timestamp": "2025-09-04 04:13:47.561346", + "step": 3141, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 816 + ], + "flops": 16320099139776.0 + }, + "timestamp": "2025-09-04 04:13:47.683316", + "step": 3141, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004386106040328741, + "timestamp": "2025-09-04 04:13:47.706499", + "step": 3142, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:13:47.800908", + "step": 3142, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0068616243079304695, + "timestamp": "2025-09-04 04:13:47.818294", + "step": 3143, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:13:47.911953", + "step": 3143, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0012470950605347753, + "timestamp": "2025-09-04 04:13:47.930050", + "step": 3144, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 04:13:48.062441", + "step": 3144, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008273441344499588, + "timestamp": "2025-09-04 04:13:48.090809", + "step": 3145, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:13:48.190377", + "step": 3145, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.05135876312851906, + "timestamp": "2025-09-04 04:13:48.209014", + "step": 3146, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:13:48.301863", + "step": 3146, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.026613635942339897, + "timestamp": "2025-09-04 04:13:48.318989", + "step": 3147, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:13:48.427395", + "step": 3147, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006781783886253834, + "timestamp": "2025-09-04 04:13:48.448382", + "step": 3148, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:13:48.540151", + "step": 3148, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0013767415657639503, + "timestamp": "2025-09-04 04:13:48.558954", + "step": 3149, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:13:48.663250", + "step": 3149, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.06130528450012207, + "timestamp": "2025-09-04 04:13:48.682512", + "step": 3150, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:13:48.792712", + "step": 3150, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011846323497593403, + "timestamp": "2025-09-04 04:13:48.813358", + "step": 3151, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:13:48.904857", + "step": 3151, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.015273387543857098, + "timestamp": "2025-09-04 04:13:48.922531", + "step": 3152, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 04:13:49.043109", + "step": 3152, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0032542271073907614, + "timestamp": "2025-09-04 04:13:49.068724", + "step": 3153, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:13:49.161574", + "step": 3153, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002285633934661746, + "timestamp": "2025-09-04 04:13:49.178731", + "step": 3154, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:13:49.280117", + "step": 3154, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009557440876960754, + "timestamp": "2025-09-04 04:13:49.299107", + "step": 3155, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:13:49.389864", + "step": 3155, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.018284492194652557, + "timestamp": "2025-09-04 04:13:49.407496", + "step": 3156, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:13:49.498858", + "step": 3156, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0062835500575602055, + "timestamp": "2025-09-04 04:13:49.517799", + "step": 3157, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:13:49.595561", + "step": 3157, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006542867515236139, + "timestamp": "2025-09-04 04:13:49.609201", + "step": 3158, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:13:49.708783", + "step": 3158, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004803582560271025, + "timestamp": "2025-09-04 04:13:49.727388", + "step": 3159, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:13:49.813834", + "step": 3159, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005237384233623743, + "timestamp": "2025-09-04 04:13:49.830356", + "step": 3160, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:13:58.274255", + "step": 3160, + "epoch": 3 + }, + { + "type": "pplx", + "content": 299.2577897885672, + "timestamp": "2025-09-04 04:13:58.276754", + "step": 3160, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3160", + "timestamp": "2025-09-04 04:13:58.818167", + "step": 3160, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:13:58.901615", + "step": 3160, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012152140960097313, + "timestamp": "2025-09-04 04:13:58.917815", + "step": 3161, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:13:59.010068", + "step": 3161, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0011444678530097008, + "timestamp": "2025-09-04 04:13:59.026556", + "step": 3162, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:13:59.127832", + "step": 3162, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0307242963463068, + "timestamp": "2025-09-04 04:13:59.146322", + "step": 3163, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:13:59.262752", + "step": 3163, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009461762383580208, + "timestamp": "2025-09-04 04:13:59.284146", + "step": 3164, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:13:59.373507", + "step": 3164, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0035491350572556257, + "timestamp": "2025-09-04 04:13:59.391646", + "step": 3165, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:13:59.501388", + "step": 3165, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0005383518873713911, + "timestamp": "2025-09-04 04:13:59.521601", + "step": 3166, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:13:59.632898", + "step": 3166, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005555047653615475, + "timestamp": "2025-09-04 04:13:59.653292", + "step": 3167, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:13:59.738310", + "step": 3167, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0026055641938000917, + "timestamp": "2025-09-04 04:13:59.754242", + "step": 3168, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:13:59.846632", + "step": 3168, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007750915363430977, + "timestamp": "2025-09-04 04:13:59.865546", + "step": 3169, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:13:59.975571", + "step": 3169, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.030947135761380196, + "timestamp": "2025-09-04 04:13:59.996080", + "step": 3170, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:14:00.095959", + "step": 3170, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008231641724705696, + "timestamp": "2025-09-04 04:14:00.113266", + "step": 3171, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:14:00.217518", + "step": 3171, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0042179482989013195, + "timestamp": "2025-09-04 04:14:00.237314", + "step": 3172, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:14:00.346381", + "step": 3172, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0010643589776009321, + "timestamp": "2025-09-04 04:14:00.368931", + "step": 3173, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:14:00.473465", + "step": 3173, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01745988056063652, + "timestamp": "2025-09-04 04:14:00.492705", + "step": 3174, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:14:00.586556", + "step": 3174, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.040873829275369644, + "timestamp": "2025-09-04 04:14:00.603677", + "step": 3175, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:14:00.705209", + "step": 3175, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0004095417389180511, + "timestamp": "2025-09-04 04:14:00.724761", + "step": 3176, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:14:00.826009", + "step": 3176, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.05139699578285217, + "timestamp": "2025-09-04 04:14:00.847061", + "step": 3177, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:14:00.947870", + "step": 3177, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01573573239147663, + "timestamp": "2025-09-04 04:14:00.966290", + "step": 3178, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 04:14:01.038915", + "step": 3178, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004238112363964319, + "timestamp": "2025-09-04 04:14:01.051828", + "step": 3179, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:14:01.146291", + "step": 3179, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013266210444271564, + "timestamp": "2025-09-04 04:14:01.164460", + "step": 3180, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:14:09.548016", + "step": 3180, + "epoch": 3 + }, + { + "type": "pplx", + "content": 297.72411337269, + "timestamp": "2025-09-04 04:14:09.549979", + "step": 3180, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:14:09.645901", + "step": 3180, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004828155972063541, + "timestamp": "2025-09-04 04:14:09.666733", + "step": 3181, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 432 + ], + "flops": 8640052517568.0 + }, + "timestamp": "2025-09-04 04:14:09.737653", + "step": 3181, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00019382215396035463, + "timestamp": "2025-09-04 04:14:09.750394", + "step": 3182, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:14:09.843532", + "step": 3182, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012143196538090706, + "timestamp": "2025-09-04 04:14:09.860957", + "step": 3183, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:14:09.966472", + "step": 3183, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012787654995918274, + "timestamp": "2025-09-04 04:14:09.987350", + "step": 3184, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:14:10.101021", + "step": 3184, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0021711078006774187, + "timestamp": "2025-09-04 04:14:10.122082", + "step": 3185, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:14:10.199346", + "step": 3185, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016758840531110764, + "timestamp": "2025-09-04 04:14:10.213436", + "step": 3186, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:14:10.318855", + "step": 3186, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0012713070027530193, + "timestamp": "2025-09-04 04:14:10.338888", + "step": 3187, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:14:10.441584", + "step": 3187, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002686247928068042, + "timestamp": "2025-09-04 04:14:10.461620", + "step": 3188, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:14:10.567666", + "step": 3188, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009823828935623169, + "timestamp": "2025-09-04 04:14:10.590298", + "step": 3189, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:14:10.673129", + "step": 3189, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011689892038702965, + "timestamp": "2025-09-04 04:14:10.688286", + "step": 3190, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:14:10.771771", + "step": 3190, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0027343537658452988, + "timestamp": "2025-09-04 04:14:10.787075", + "step": 3191, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:14:10.886962", + "step": 3191, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008726535364985466, + "timestamp": "2025-09-04 04:14:10.906597", + "step": 3192, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:14:11.007668", + "step": 3192, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006816827226430178, + "timestamp": "2025-09-04 04:14:11.028767", + "step": 3193, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:14:11.141248", + "step": 3193, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00433404790237546, + "timestamp": "2025-09-04 04:14:11.161661", + "step": 3194, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:14:11.271042", + "step": 3194, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005224962718784809, + "timestamp": "2025-09-04 04:14:11.291628", + "step": 3195, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:14:11.402317", + "step": 3195, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00549314683303237, + "timestamp": "2025-09-04 04:14:11.421990", + "step": 3196, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:14:11.512659", + "step": 3196, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.015487665310502052, + "timestamp": "2025-09-04 04:14:11.531859", + "step": 3197, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:14:11.627288", + "step": 3197, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005331105552613735, + "timestamp": "2025-09-04 04:14:11.644713", + "step": 3198, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:14:11.745595", + "step": 3198, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0038703870959579945, + "timestamp": "2025-09-04 04:14:11.764490", + "step": 3199, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:14:11.841979", + "step": 3199, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013186764903366566, + "timestamp": "2025-09-04 04:14:11.856696", + "step": 3200, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:14:20.282753", + "step": 3200, + "epoch": 3 + }, + { + "type": "pplx", + "content": 298.56920811002476, + "timestamp": "2025-09-04 04:14:20.284562", + "step": 3200, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3200", + "timestamp": "2025-09-04 04:14:20.791042", + "step": 3200, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:14:20.863498", + "step": 3200, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.027715997770428658, + "timestamp": "2025-09-04 04:14:20.878154", + "step": 3201, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:14:20.982383", + "step": 3201, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0028067301027476788, + "timestamp": "2025-09-04 04:14:21.001527", + "step": 3202, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:14:21.100913", + "step": 3202, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0029784520156681538, + "timestamp": "2025-09-04 04:14:21.119598", + "step": 3203, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:14:21.215903", + "step": 3203, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0007651972700841725, + "timestamp": "2025-09-04 04:14:21.234311", + "step": 3204, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:14:21.335668", + "step": 3204, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03393160179257393, + "timestamp": "2025-09-04 04:14:21.356658", + "step": 3205, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:14:21.452878", + "step": 3205, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0033680200576782227, + "timestamp": "2025-09-04 04:14:21.470525", + "step": 3206, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:14:21.556580", + "step": 3206, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.022258544340729713, + "timestamp": "2025-09-04 04:14:21.572030", + "step": 3207, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:14:21.666815", + "step": 3207, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.000977307092398405, + "timestamp": "2025-09-04 04:14:21.685202", + "step": 3208, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 800 + ], + "flops": 16000097197184.0 + }, + "timestamp": "2025-09-04 04:14:21.802759", + "step": 3208, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009067521430552006, + "timestamp": "2025-09-04 04:14:21.826562", + "step": 3209, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:14:21.930639", + "step": 3209, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005365348886698484, + "timestamp": "2025-09-04 04:14:21.949872", + "step": 3210, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:14:22.049801", + "step": 3210, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016756800934672356, + "timestamp": "2025-09-04 04:14:22.068544", + "step": 3211, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:14:22.152516", + "step": 3211, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0015038455603644252, + "timestamp": "2025-09-04 04:14:22.167154", + "step": 3212, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:14:22.248469", + "step": 3212, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002096136100590229, + "timestamp": "2025-09-04 04:14:22.265055", + "step": 3213, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:14:22.347176", + "step": 3213, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006991859059780836, + "timestamp": "2025-09-04 04:14:22.362368", + "step": 3214, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:14:22.465008", + "step": 3214, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004486733116209507, + "timestamp": "2025-09-04 04:14:22.484327", + "step": 3215, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:14:22.574918", + "step": 3215, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.015569731593132019, + "timestamp": "2025-09-04 04:14:22.592568", + "step": 3216, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:14:22.690066", + "step": 3216, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02023530937731266, + "timestamp": "2025-09-04 04:14:22.710895", + "step": 3217, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 04:14:22.834269", + "step": 3217, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009704073891043663, + "timestamp": "2025-09-04 04:14:22.857521", + "step": 3218, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:14:22.960755", + "step": 3218, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00012312929902691394, + "timestamp": "2025-09-04 04:14:22.980128", + "step": 3219, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 816 + ], + "flops": 16320099139776.0 + }, + "timestamp": "2025-09-04 04:14:23.101629", + "step": 3219, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0024548498913645744, + "timestamp": "2025-09-04 04:14:23.125454", + "step": 3220, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:14:31.604710", + "step": 3220, + "epoch": 3 + }, + { + "type": "pplx", + "content": 302.84948080396117, + "timestamp": "2025-09-04 04:14:31.606931", + "step": 3220, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:14:31.688935", + "step": 3220, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03970217704772949, + "timestamp": "2025-09-04 04:14:31.706195", + "step": 3221, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:14:31.806815", + "step": 3221, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0249653160572052, + "timestamp": "2025-09-04 04:14:31.825636", + "step": 3222, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:14:31.903292", + "step": 3222, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00563463568687439, + "timestamp": "2025-09-04 04:14:31.917510", + "step": 3223, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:14:32.010566", + "step": 3223, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012554388493299484, + "timestamp": "2025-09-04 04:14:32.028457", + "step": 3224, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:14:32.118821", + "step": 3224, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.045332036912441254, + "timestamp": "2025-09-04 04:14:32.137174", + "step": 3225, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:14:32.238227", + "step": 3225, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0016650618053972721, + "timestamp": "2025-09-04 04:14:32.257069", + "step": 3226, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:14:32.350727", + "step": 3226, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0041397614404559135, + "timestamp": "2025-09-04 04:14:32.368149", + "step": 3227, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:14:32.462491", + "step": 3227, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0027826486621052027, + "timestamp": "2025-09-04 04:14:32.480741", + "step": 3228, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:14:32.568652", + "step": 3228, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007442697882652283, + "timestamp": "2025-09-04 04:14:32.587093", + "step": 3229, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:14:32.665351", + "step": 3229, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006308171898126602, + "timestamp": "2025-09-04 04:14:32.679438", + "step": 3230, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:14:32.772202", + "step": 3230, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0004488340055104345, + "timestamp": "2025-09-04 04:14:32.789529", + "step": 3231, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:14:32.866260", + "step": 3231, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0021919619757682085, + "timestamp": "2025-09-04 04:14:32.881055", + "step": 3232, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:14:32.971578", + "step": 3232, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007420377805829048, + "timestamp": "2025-09-04 04:14:32.990460", + "step": 3233, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:14:33.092494", + "step": 3233, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002644237130880356, + "timestamp": "2025-09-04 04:14:33.111654", + "step": 3234, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 944 + ], + "flops": 18880114680512.0 + }, + "timestamp": "2025-09-04 04:14:33.247913", + "step": 3234, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0011826629051938653, + "timestamp": "2025-09-04 04:14:33.274252", + "step": 3235, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:14:33.375785", + "step": 3235, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02761908806860447, + "timestamp": "2025-09-04 04:14:33.395525", + "step": 3236, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:14:33.497828", + "step": 3236, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.06943929940462112, + "timestamp": "2025-09-04 04:14:33.519104", + "step": 3237, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:14:33.618795", + "step": 3237, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012753068469464779, + "timestamp": "2025-09-04 04:14:33.637467", + "step": 3238, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:14:33.741847", + "step": 3238, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008138732053339481, + "timestamp": "2025-09-04 04:14:33.761229", + "step": 3239, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:14:33.864802", + "step": 3239, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0004093741299584508, + "timestamp": "2025-09-04 04:14:33.884886", + "step": 3240, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:14:42.381848", + "step": 3240, + "epoch": 3 + }, + { + "type": "pplx", + "content": 302.56651160009335, + "timestamp": "2025-09-04 04:14:42.383805", + "step": 3240, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3240", + "timestamp": "2025-09-04 04:14:42.856128", + "step": 3240, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 416 + ], + "flops": 8320050574976.0 + }, + "timestamp": "2025-09-04 04:14:42.923762", + "step": 3240, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003945726901292801, + "timestamp": "2025-09-04 04:14:42.937279", + "step": 3241, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:14:43.015815", + "step": 3241, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0013066886458545923, + "timestamp": "2025-09-04 04:14:43.030023", + "step": 3242, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:14:43.128688", + "step": 3242, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005977214314043522, + "timestamp": "2025-09-04 04:14:43.147386", + "step": 3243, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:14:43.247664", + "step": 3243, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006708620116114616, + "timestamp": "2025-09-04 04:14:43.267330", + "step": 3244, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:14:43.366677", + "step": 3244, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010317733511328697, + "timestamp": "2025-09-04 04:14:43.387825", + "step": 3245, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:14:43.497352", + "step": 3245, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001994991209357977, + "timestamp": "2025-09-04 04:14:43.518042", + "step": 3246, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:14:43.613568", + "step": 3246, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0030987162608653307, + "timestamp": "2025-09-04 04:14:43.631222", + "step": 3247, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:14:43.740095", + "step": 3247, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0005180987645871937, + "timestamp": "2025-09-04 04:14:43.761304", + "step": 3248, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:14:43.852971", + "step": 3248, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00039079232374206185, + "timestamp": "2025-09-04 04:14:43.872164", + "step": 3249, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:14:43.966444", + "step": 3249, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00027972026146017015, + "timestamp": "2025-09-04 04:14:43.984013", + "step": 3250, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:14:44.093314", + "step": 3250, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0020345128141343594, + "timestamp": "2025-09-04 04:14:44.113985", + "step": 3251, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:14:44.214840", + "step": 3251, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008928967639803886, + "timestamp": "2025-09-04 04:14:44.234614", + "step": 3252, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:14:44.326751", + "step": 3252, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0039917477406561375, + "timestamp": "2025-09-04 04:14:44.346001", + "step": 3253, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:14:44.448569", + "step": 3253, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0008001399110071361, + "timestamp": "2025-09-04 04:14:44.467465", + "step": 3254, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:14:44.562224", + "step": 3254, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0018813287606462836, + "timestamp": "2025-09-04 04:14:44.579611", + "step": 3255, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:14:44.665229", + "step": 3255, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0071727619506418705, + "timestamp": "2025-09-04 04:14:44.681606", + "step": 3256, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:14:44.755422", + "step": 3256, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.019573703408241272, + "timestamp": "2025-09-04 04:14:44.770563", + "step": 3257, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:14:44.879668", + "step": 3257, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010637468658387661, + "timestamp": "2025-09-04 04:14:44.900200", + "step": 3258, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:14:44.995225", + "step": 3258, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0005085250595584512, + "timestamp": "2025-09-04 04:14:45.012783", + "step": 3259, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:14:45.118189", + "step": 3259, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004459428135305643, + "timestamp": "2025-09-04 04:14:45.139052", + "step": 3260, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:14:53.619951", + "step": 3260, + "epoch": 3 + }, + { + "type": "pplx", + "content": 303.6349730675357, + "timestamp": "2025-09-04 04:14:53.621797", + "step": 3260, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:14:53.720593", + "step": 3260, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007222423795610666, + "timestamp": "2025-09-04 04:14:53.741702", + "step": 3261, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:14:53.833132", + "step": 3261, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0010594201739877462, + "timestamp": "2025-09-04 04:14:53.849926", + "step": 3262, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:14:53.949469", + "step": 3262, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002619499806314707, + "timestamp": "2025-09-04 04:14:53.968160", + "step": 3263, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:14:54.069273", + "step": 3263, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002300729276612401, + "timestamp": "2025-09-04 04:14:54.088775", + "step": 3264, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:14:54.193743", + "step": 3264, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0029513502959161997, + "timestamp": "2025-09-04 04:14:54.215967", + "step": 3265, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:14:54.311347", + "step": 3265, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0033043273724615574, + "timestamp": "2025-09-04 04:14:54.328764", + "step": 3266, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:14:54.423680", + "step": 3266, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0076232897117733955, + "timestamp": "2025-09-04 04:14:54.441064", + "step": 3267, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:14:54.549806", + "step": 3267, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012323771603405476, + "timestamp": "2025-09-04 04:14:54.570969", + "step": 3268, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:14:54.677234", + "step": 3268, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014858097769320011, + "timestamp": "2025-09-04 04:14:54.699205", + "step": 3269, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 04:14:54.771795", + "step": 3269, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006694823037832975, + "timestamp": "2025-09-04 04:14:54.784684", + "step": 3270, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:14:54.870959", + "step": 3270, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0045067863538861275, + "timestamp": "2025-09-04 04:14:54.886529", + "step": 3271, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:14:54.994351", + "step": 3271, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0029527172446250916, + "timestamp": "2025-09-04 04:14:55.015092", + "step": 3272, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:14:55.106218", + "step": 3272, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010578243993222713, + "timestamp": "2025-09-04 04:14:55.124807", + "step": 3273, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:14:55.217610", + "step": 3273, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004275831393897533, + "timestamp": "2025-09-04 04:14:55.234893", + "step": 3274, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:14:55.338811", + "step": 3274, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004477000795304775, + "timestamp": "2025-09-04 04:14:55.358098", + "step": 3275, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:14:55.454527", + "step": 3275, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003879428841173649, + "timestamp": "2025-09-04 04:14:55.472942", + "step": 3276, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:14:55.577939", + "step": 3276, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.06313685327768326, + "timestamp": "2025-09-04 04:14:55.599965", + "step": 3277, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1120 + ], + "flops": 22400136049024.0 + }, + "timestamp": "2025-09-04 04:14:55.763310", + "step": 3277, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002089696703478694, + "timestamp": "2025-09-04 04:14:55.795506", + "step": 3278, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:14:55.880590", + "step": 3278, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0007018198375590146, + "timestamp": "2025-09-04 04:14:55.895788", + "step": 3279, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:14:55.998173", + "step": 3279, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02527954801917076, + "timestamp": "2025-09-04 04:14:56.018237", + "step": 3280, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:15:04.467066", + "step": 3280, + "epoch": 3 + }, + { + "type": "pplx", + "content": 306.23200739000436, + "timestamp": "2025-09-04 04:15:04.469448", + "step": 3280, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3280", + "timestamp": "2025-09-04 04:15:04.982903", + "step": 3280, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:15:05.079346", + "step": 3280, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03897909075021744, + "timestamp": "2025-09-04 04:15:05.099648", + "step": 3281, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:15:05.202261", + "step": 3281, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0016802679747343063, + "timestamp": "2025-09-04 04:15:05.221465", + "step": 3282, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:15:05.329841", + "step": 3282, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002108318265527487, + "timestamp": "2025-09-04 04:15:05.349197", + "step": 3283, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:15:05.452573", + "step": 3283, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00026367095415480435, + "timestamp": "2025-09-04 04:15:05.472578", + "step": 3284, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:15:05.563126", + "step": 3284, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0026424103416502476, + "timestamp": "2025-09-04 04:15:05.581936", + "step": 3285, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:15:05.699833", + "step": 3285, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00553141999989748, + "timestamp": "2025-09-04 04:15:05.721891", + "step": 3286, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:15:05.814069", + "step": 3286, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009483684785664082, + "timestamp": "2025-09-04 04:15:05.830807", + "step": 3287, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:15:05.926426", + "step": 3287, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0044782888144254684, + "timestamp": "2025-09-04 04:15:05.944699", + "step": 3288, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 800 + ], + "flops": 16000097197184.0 + }, + "timestamp": "2025-09-04 04:15:06.061273", + "step": 3288, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0009773524943739176, + "timestamp": "2025-09-04 04:15:06.085178", + "step": 3289, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:15:06.192109", + "step": 3289, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00231315684504807, + "timestamp": "2025-09-04 04:15:06.212110", + "step": 3290, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:15:06.305889", + "step": 3290, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0034602778032422066, + "timestamp": "2025-09-04 04:15:06.320028", + "step": 3291, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:15:06.402443", + "step": 3291, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005539227742701769, + "timestamp": "2025-09-04 04:15:06.418345", + "step": 3292, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:15:06.498804", + "step": 3292, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0033372659236192703, + "timestamp": "2025-09-04 04:15:06.515338", + "step": 3293, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:15:06.635892", + "step": 3293, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0007293337839655578, + "timestamp": "2025-09-04 04:15:06.655876", + "step": 3294, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 04:15:06.791530", + "step": 3294, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006761676049791276, + "timestamp": "2025-09-04 04:15:06.817475", + "step": 3295, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:15:06.904333", + "step": 3295, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.032214339822530746, + "timestamp": "2025-09-04 04:15:06.920789", + "step": 3296, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:15:06.994544", + "step": 3296, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004622759763151407, + "timestamp": "2025-09-04 04:15:07.009354", + "step": 3297, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 448 + ], + "flops": 8960054460160.0 + }, + "timestamp": "2025-09-04 04:15:07.081752", + "step": 3297, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00865192525088787, + "timestamp": "2025-09-04 04:15:07.094680", + "step": 3298, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:15:07.190603", + "step": 3298, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004962395876646042, + "timestamp": "2025-09-04 04:15:07.208103", + "step": 3299, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:15:07.306707", + "step": 3299, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004638133570551872, + "timestamp": "2025-09-04 04:15:07.324605", + "step": 3300, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:15:15.708785", + "step": 3300, + "epoch": 3 + }, + { + "type": "pplx", + "content": 310.93065235576375, + "timestamp": "2025-09-04 04:15:15.710762", + "step": 3300, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:15:15.809849", + "step": 3300, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00043890104279853404, + "timestamp": "2025-09-04 04:15:15.831027", + "step": 3301, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:15:15.936809", + "step": 3301, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.031615160405635834, + "timestamp": "2025-09-04 04:15:15.956837", + "step": 3302, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:15:16.034876", + "step": 3302, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012209202162921429, + "timestamp": "2025-09-04 04:15:16.048981", + "step": 3303, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:15:16.139795", + "step": 3303, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01653936877846718, + "timestamp": "2025-09-04 04:15:16.157321", + "step": 3304, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:15:16.255858", + "step": 3304, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016090011224150658, + "timestamp": "2025-09-04 04:15:16.276637", + "step": 3305, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:15:16.379531", + "step": 3305, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0072241052985191345, + "timestamp": "2025-09-04 04:15:16.398754", + "step": 3306, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:15:16.499129", + "step": 3306, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0002566951443441212, + "timestamp": "2025-09-04 04:15:16.517889", + "step": 3307, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:15:16.596145", + "step": 3307, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0032684989273548126, + "timestamp": "2025-09-04 04:15:16.610992", + "step": 3308, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:15:16.685266", + "step": 3308, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0016026400262489915, + "timestamp": "2025-09-04 04:15:16.700263", + "step": 3309, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:15:16.807189", + "step": 3309, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002181933494284749, + "timestamp": "2025-09-04 04:15:16.827297", + "step": 3310, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:15:16.937081", + "step": 3310, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002524265320971608, + "timestamp": "2025-09-04 04:15:16.957733", + "step": 3311, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:15:17.068037", + "step": 3311, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0015753493644297123, + "timestamp": "2025-09-04 04:15:17.089325", + "step": 3312, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:15:17.173509", + "step": 3312, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010916945524513721, + "timestamp": "2025-09-04 04:15:17.190442", + "step": 3313, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:15:17.300267", + "step": 3313, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002287115901708603, + "timestamp": "2025-09-04 04:15:17.320788", + "step": 3314, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:15:17.411659", + "step": 3314, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02453738823533058, + "timestamp": "2025-09-04 04:15:17.428428", + "step": 3315, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:15:17.526468", + "step": 3315, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0024574180133640766, + "timestamp": "2025-09-04 04:15:17.545929", + "step": 3316, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:15:17.634340", + "step": 3316, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003129773773252964, + "timestamp": "2025-09-04 04:15:17.652692", + "step": 3317, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:15:17.730020", + "step": 3317, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0017890299204736948, + "timestamp": "2025-09-04 04:15:17.744156", + "step": 3318, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:15:17.846641", + "step": 3318, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0003455560654401779, + "timestamp": "2025-09-04 04:15:17.865764", + "step": 3319, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:15:17.955724", + "step": 3319, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.028852151706814766, + "timestamp": "2025-09-04 04:15:17.973313", + "step": 3320, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:15:26.366823", + "step": 3320, + "epoch": 3 + }, + { + "type": "pplx", + "content": 315.5775818717673, + "timestamp": "2025-09-04 04:15:26.368541", + "step": 3320, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3320", + "timestamp": "2025-09-04 04:15:26.717810", + "step": 3320, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 04:15:26.835668", + "step": 3320, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0002499267866369337, + "timestamp": "2025-09-04 04:15:26.860977", + "step": 3321, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:15:26.945054", + "step": 3321, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0008101621060632169, + "timestamp": "2025-09-04 04:15:26.960654", + "step": 3322, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:15:27.054780", + "step": 3322, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012700149789452553, + "timestamp": "2025-09-04 04:15:27.072188", + "step": 3323, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:15:27.172098", + "step": 3323, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007206479553133249, + "timestamp": "2025-09-04 04:15:27.191492", + "step": 3324, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:15:27.268978", + "step": 3324, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013278383761644363, + "timestamp": "2025-09-04 04:15:27.284463", + "step": 3325, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:15:27.395471", + "step": 3325, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004693718161433935, + "timestamp": "2025-09-04 04:15:27.414655", + "step": 3326, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:15:27.490883", + "step": 3326, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.029808765277266502, + "timestamp": "2025-09-04 04:15:27.504664", + "step": 3327, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:15:27.608290", + "step": 3327, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00016743317246437073, + "timestamp": "2025-09-04 04:15:27.628357", + "step": 3328, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:15:27.703701", + "step": 3328, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.018314022570848465, + "timestamp": "2025-09-04 04:15:27.718997", + "step": 3329, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:15:27.828684", + "step": 3329, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0024994483683258295, + "timestamp": "2025-09-04 04:15:27.849182", + "step": 3330, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:15:27.935748", + "step": 3330, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00011107311001978815, + "timestamp": "2025-09-04 04:15:27.951411", + "step": 3331, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 848 + ], + "flops": 16960103024960.0 + }, + "timestamp": "2025-09-04 04:15:28.077014", + "step": 3331, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010431556962430477, + "timestamp": "2025-09-04 04:15:28.101871", + "step": 3332, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:15:28.193747", + "step": 3332, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03157045319676399, + "timestamp": "2025-09-04 04:15:28.212571", + "step": 3333, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:15:28.296301", + "step": 3333, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04083694517612457, + "timestamp": "2025-09-04 04:15:28.311522", + "step": 3334, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:15:28.405898", + "step": 3334, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0034233976621180773, + "timestamp": "2025-09-04 04:15:28.423349", + "step": 3335, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:15:28.532837", + "step": 3335, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.021260175853967667, + "timestamp": "2025-09-04 04:15:28.554152", + "step": 3336, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:15:28.651272", + "step": 3336, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.015738222748041153, + "timestamp": "2025-09-04 04:15:28.671979", + "step": 3337, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:15:28.775677", + "step": 3337, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007692721672356129, + "timestamp": "2025-09-04 04:15:28.794901", + "step": 3338, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:15:28.896271", + "step": 3338, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0014377308543771505, + "timestamp": "2025-09-04 04:15:28.915136", + "step": 3339, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:15:28.993680", + "step": 3339, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016688672825694084, + "timestamp": "2025-09-04 04:15:29.008612", + "step": 3340, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:15:37.370468", + "step": 3340, + "epoch": 3 + }, + { + "type": "pplx", + "content": 312.33404172021034, + "timestamp": "2025-09-04 04:15:37.372516", + "step": 3340, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:15:37.473968", + "step": 3340, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011500056833028793, + "timestamp": "2025-09-04 04:15:37.495856", + "step": 3341, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:15:37.606004", + "step": 3341, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0288741085678339, + "timestamp": "2025-09-04 04:15:37.626404", + "step": 3342, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:15:37.728252", + "step": 3342, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004480296280235052, + "timestamp": "2025-09-04 04:15:37.747168", + "step": 3343, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:15:37.842312", + "step": 3343, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002481586765497923, + "timestamp": "2025-09-04 04:15:37.860519", + "step": 3344, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:15:37.949491", + "step": 3344, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013428304344415665, + "timestamp": "2025-09-04 04:15:37.967921", + "step": 3345, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:15:38.045038", + "step": 3345, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0397348590195179, + "timestamp": "2025-09-04 04:15:38.058674", + "step": 3346, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:15:38.166347", + "step": 3346, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0024191653355956078, + "timestamp": "2025-09-04 04:15:38.186670", + "step": 3347, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:15:38.302749", + "step": 3347, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0025882297195494175, + "timestamp": "2025-09-04 04:15:38.325668", + "step": 3348, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:15:38.438407", + "step": 3348, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002427577506750822, + "timestamp": "2025-09-04 04:15:38.461108", + "step": 3349, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:15:38.539248", + "step": 3349, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003565673716366291, + "timestamp": "2025-09-04 04:15:38.553261", + "step": 3350, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:15:38.643791", + "step": 3350, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0014742841012775898, + "timestamp": "2025-09-04 04:15:38.660494", + "step": 3351, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:15:38.770850", + "step": 3351, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006036367267370224, + "timestamp": "2025-09-04 04:15:38.792263", + "step": 3352, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:15:38.889981", + "step": 3352, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.023486964404582977, + "timestamp": "2025-09-04 04:15:38.910660", + "step": 3353, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:15:39.018492", + "step": 3353, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008537087589502335, + "timestamp": "2025-09-04 04:15:39.038759", + "step": 3354, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:15:39.122884", + "step": 3354, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0004986139247193933, + "timestamp": "2025-09-04 04:15:39.138020", + "step": 3355, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:15:39.245800", + "step": 3355, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03504369035363197, + "timestamp": "2025-09-04 04:15:39.266945", + "step": 3356, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:15:39.343131", + "step": 3356, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.017468314617872238, + "timestamp": "2025-09-04 04:15:39.358533", + "step": 3357, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:15:39.461610", + "step": 3357, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005034047178924084, + "timestamp": "2025-09-04 04:15:39.480862", + "step": 3358, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:15:39.566712", + "step": 3358, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.031177129596471786, + "timestamp": "2025-09-04 04:15:39.582311", + "step": 3359, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:15:39.684803", + "step": 3359, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0004434007278177887, + "timestamp": "2025-09-04 04:15:39.704763", + "step": 3360, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:15:48.145682", + "step": 3360, + "epoch": 3 + }, + { + "type": "pplx", + "content": 301.0731658055851, + "timestamp": "2025-09-04 04:15:48.147942", + "step": 3360, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3360", + "timestamp": "2025-09-04 04:15:48.687287", + "step": 3360, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:15:48.767367", + "step": 3360, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0026787512470036745, + "timestamp": "2025-09-04 04:15:48.783668", + "step": 3361, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:15:48.896544", + "step": 3361, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.05144086107611656, + "timestamp": "2025-09-04 04:15:48.916900", + "step": 3362, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:15:49.012988", + "step": 3362, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00442865677177906, + "timestamp": "2025-09-04 04:15:49.030271", + "step": 3363, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:15:49.131246", + "step": 3363, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0024153843987733126, + "timestamp": "2025-09-04 04:15:49.150792", + "step": 3364, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:15:49.247227", + "step": 3364, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00037681308458559215, + "timestamp": "2025-09-04 04:15:49.267476", + "step": 3365, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:15:49.361875", + "step": 3365, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004885226022452116, + "timestamp": "2025-09-04 04:15:49.378943", + "step": 3366, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:15:49.479578", + "step": 3366, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.020293110981583595, + "timestamp": "2025-09-04 04:15:49.498362", + "step": 3367, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:15:49.601224", + "step": 3367, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02302255854010582, + "timestamp": "2025-09-04 04:15:49.620988", + "step": 3368, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:15:49.727729", + "step": 3368, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0001401986082782969, + "timestamp": "2025-09-04 04:15:49.750154", + "step": 3369, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:15:49.859459", + "step": 3369, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03139295056462288, + "timestamp": "2025-09-04 04:15:49.879856", + "step": 3370, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:15:49.981889", + "step": 3370, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.000647324079181999, + "timestamp": "2025-09-04 04:15:50.000762", + "step": 3371, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 432 + ], + "flops": 8640052517568.0 + }, + "timestamp": "2025-09-04 04:15:50.071999", + "step": 3371, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.020890971645712852, + "timestamp": "2025-09-04 04:15:50.085529", + "step": 3372, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:15:50.181131", + "step": 3372, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0002276118320878595, + "timestamp": "2025-09-04 04:15:50.200295", + "step": 3373, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:15:50.306852", + "step": 3373, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.021090731024742126, + "timestamp": "2025-09-04 04:15:50.326971", + "step": 3374, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 944 + ], + "flops": 18880114680512.0 + }, + "timestamp": "2025-09-04 04:15:50.463072", + "step": 3374, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0053464737720787525, + "timestamp": "2025-09-04 04:15:50.489249", + "step": 3375, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:15:50.600962", + "step": 3375, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00169714679941535, + "timestamp": "2025-09-04 04:15:50.622394", + "step": 3376, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:15:50.705172", + "step": 3376, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0071422443725168705, + "timestamp": "2025-09-04 04:15:50.722252", + "step": 3377, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:15:50.812122", + "step": 3377, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0026480015367269516, + "timestamp": "2025-09-04 04:15:50.828896", + "step": 3378, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:15:50.928878", + "step": 3378, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0017498015658929944, + "timestamp": "2025-09-04 04:15:50.947886", + "step": 3379, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:15:51.048104", + "step": 3379, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001030558254569769, + "timestamp": "2025-09-04 04:15:51.067707", + "step": 3380, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:15:59.446548", + "step": 3380, + "epoch": 3 + }, + { + "type": "pplx", + "content": 294.73432370067934, + "timestamp": "2025-09-04 04:15:59.448500", + "step": 3380, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:15:59.530511", + "step": 3380, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00033817789517343044, + "timestamp": "2025-09-04 04:15:59.547754", + "step": 3381, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:15:59.648370", + "step": 3381, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006461319862864912, + "timestamp": "2025-09-04 04:15:59.667244", + "step": 3382, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:15:59.752986", + "step": 3382, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.027155397459864616, + "timestamp": "2025-09-04 04:15:59.768515", + "step": 3383, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:15:59.866742", + "step": 3383, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.07015629857778549, + "timestamp": "2025-09-04 04:15:59.884939", + "step": 3384, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:15:59.998615", + "step": 3384, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0016538852360099554, + "timestamp": "2025-09-04 04:16:00.022906", + "step": 3385, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:16:00.122185", + "step": 3385, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.11791013926267624, + "timestamp": "2025-09-04 04:16:00.140724", + "step": 3386, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:16:00.228744", + "step": 3386, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014148048125207424, + "timestamp": "2025-09-04 04:16:00.244329", + "step": 3387, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 816 + ], + "flops": 16320099139776.0 + }, + "timestamp": "2025-09-04 04:16:00.365436", + "step": 3387, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0005752384895458817, + "timestamp": "2025-09-04 04:16:00.389346", + "step": 3388, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:16:00.490559", + "step": 3388, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0020116129890084267, + "timestamp": "2025-09-04 04:16:00.511811", + "step": 3389, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:16:00.612654", + "step": 3389, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.015090583823621273, + "timestamp": "2025-09-04 04:16:00.631434", + "step": 3390, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:16:00.726083", + "step": 3390, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0049343351274728775, + "timestamp": "2025-09-04 04:16:00.743678", + "step": 3391, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:16:00.843939", + "step": 3391, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.000529682612977922, + "timestamp": "2025-09-04 04:16:00.863659", + "step": 3392, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:16:00.956813", + "step": 3392, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0011004252592101693, + "timestamp": "2025-09-04 04:16:00.976139", + "step": 3393, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:16:01.070353", + "step": 3393, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02100779488682747, + "timestamp": "2025-09-04 04:16:01.087563", + "step": 3394, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:16:01.187918", + "step": 3394, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010034440085291862, + "timestamp": "2025-09-04 04:16:01.206884", + "step": 3395, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:16:01.299913", + "step": 3395, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04920711740851402, + "timestamp": "2025-09-04 04:16:01.317898", + "step": 3396, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:16:01.423832", + "step": 3396, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.041153181344270706, + "timestamp": "2025-09-04 04:16:01.445643", + "step": 3397, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:16:01.549800", + "step": 3397, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02985711395740509, + "timestamp": "2025-09-04 04:16:01.569180", + "step": 3398, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:16:01.668606", + "step": 3398, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.000561038323212415, + "timestamp": "2025-09-04 04:16:01.687288", + "step": 3399, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:16:01.787984", + "step": 3399, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002825426869094372, + "timestamp": "2025-09-04 04:16:01.807598", + "step": 3400, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:16:10.365512", + "step": 3400, + "epoch": 3 + }, + { + "type": "pplx", + "content": 285.49062028489874, + "timestamp": "2025-09-04 04:16:10.371170", + "step": 3400, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3400", + "timestamp": "2025-09-04 04:16:10.757804", + "step": 3400, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:16:10.846129", + "step": 3400, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002441899385303259, + "timestamp": "2025-09-04 04:16:10.864175", + "step": 3401, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:16:10.962424", + "step": 3401, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0007989015430212021, + "timestamp": "2025-09-04 04:16:10.979656", + "step": 3402, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:16:11.084061", + "step": 3402, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0034658664371818304, + "timestamp": "2025-09-04 04:16:11.103037", + "step": 3403, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 944 + ], + "flops": 18880114680512.0 + }, + "timestamp": "2025-09-04 04:16:11.242472", + "step": 3403, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011140529066324234, + "timestamp": "2025-09-04 04:16:11.269313", + "step": 3404, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:16:11.359769", + "step": 3404, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007334363646805286, + "timestamp": "2025-09-04 04:16:11.377858", + "step": 3405, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:16:11.470040", + "step": 3405, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007528170011937618, + "timestamp": "2025-09-04 04:16:11.486570", + "step": 3406, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:16:11.591013", + "step": 3406, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0002277525927638635, + "timestamp": "2025-09-04 04:16:11.609976", + "step": 3407, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:16:11.702970", + "step": 3407, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0009056737762875855, + "timestamp": "2025-09-04 04:16:11.720292", + "step": 3408, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:16:11.822714", + "step": 3408, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007609906140714884, + "timestamp": "2025-09-04 04:16:11.843614", + "step": 3409, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:16:11.949959", + "step": 3409, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008335032500326633, + "timestamp": "2025-09-04 04:16:11.969001", + "step": 3410, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:16:12.048036", + "step": 3410, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0005043984274379909, + "timestamp": "2025-09-04 04:16:12.061607", + "step": 3411, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:16:12.174289", + "step": 3411, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005011504516005516, + "timestamp": "2025-09-04 04:16:12.195379", + "step": 3412, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:16:12.294306", + "step": 3412, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.030210444703698158, + "timestamp": "2025-09-04 04:16:12.314552", + "step": 3413, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:16:12.420677", + "step": 3413, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0011688501108437777, + "timestamp": "2025-09-04 04:16:12.439751", + "step": 3414, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:16:12.554307", + "step": 3414, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014814517460763454, + "timestamp": "2025-09-04 04:16:12.573324", + "step": 3415, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:16:12.676000", + "step": 3415, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.022736500948667526, + "timestamp": "2025-09-04 04:16:12.695440", + "step": 3416, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:16:12.797542", + "step": 3416, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0009403983131051064, + "timestamp": "2025-09-04 04:16:12.818416", + "step": 3417, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:16:12.930686", + "step": 3417, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003918331582099199, + "timestamp": "2025-09-04 04:16:12.951167", + "step": 3418, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:16:13.070130", + "step": 3418, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002362527186051011, + "timestamp": "2025-09-04 04:16:13.092009", + "step": 3419, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:16:13.199697", + "step": 3419, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011338985525071621, + "timestamp": "2025-09-04 04:16:13.220218", + "step": 3420, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:16:21.725284", + "step": 3420, + "epoch": 3 + }, + { + "type": "pplx", + "content": 281.34383504857846, + "timestamp": "2025-09-04 04:16:21.727582", + "step": 3420, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:16:21.826125", + "step": 3420, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003105961252003908, + "timestamp": "2025-09-04 04:16:21.847337", + "step": 3421, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:16:21.932865", + "step": 3421, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0025257884990423918, + "timestamp": "2025-09-04 04:16:21.948610", + "step": 3422, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:16:22.040843", + "step": 3422, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0005112270591780543, + "timestamp": "2025-09-04 04:16:22.057951", + "step": 3423, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1248 + ], + "flops": 24960151589760.0 + }, + "timestamp": "2025-09-04 04:16:22.241785", + "step": 3423, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012885574251413345, + "timestamp": "2025-09-04 04:16:22.277227", + "step": 3424, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:16:22.368483", + "step": 3424, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007698435802012682, + "timestamp": "2025-09-04 04:16:22.387279", + "step": 3425, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:16:22.496149", + "step": 3425, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006365908775478601, + "timestamp": "2025-09-04 04:16:22.516680", + "step": 3426, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:16:22.619441", + "step": 3426, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.026056919246912003, + "timestamp": "2025-09-04 04:16:22.638651", + "step": 3427, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:16:22.722696", + "step": 3427, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0844995528459549, + "timestamp": "2025-09-04 04:16:22.738555", + "step": 3428, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:16:22.822651", + "step": 3428, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0028854222036898136, + "timestamp": "2025-09-04 04:16:22.839858", + "step": 3429, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:16:22.929402", + "step": 3429, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.000976667390204966, + "timestamp": "2025-09-04 04:16:22.946348", + "step": 3430, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:16:23.044574", + "step": 3430, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.019412081688642502, + "timestamp": "2025-09-04 04:16:23.063135", + "step": 3431, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:16:23.162180", + "step": 3431, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0002543810987845063, + "timestamp": "2025-09-04 04:16:23.181583", + "step": 3432, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1248 + ], + "flops": 24960151589760.0 + }, + "timestamp": "2025-09-04 04:16:23.360867", + "step": 3432, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0077868239022791386, + "timestamp": "2025-09-04 04:16:23.398858", + "step": 3433, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:16:23.498788", + "step": 3433, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0018246417166665196, + "timestamp": "2025-09-04 04:16:23.517327", + "step": 3434, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:16:23.594959", + "step": 3434, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03193148225545883, + "timestamp": "2025-09-04 04:16:23.609001", + "step": 3435, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:16:23.711499", + "step": 3435, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0072656129486858845, + "timestamp": "2025-09-04 04:16:23.731140", + "step": 3436, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:16:23.830096", + "step": 3436, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0032099627424031496, + "timestamp": "2025-09-04 04:16:23.850782", + "step": 3437, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:16:23.945449", + "step": 3437, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004939934704452753, + "timestamp": "2025-09-04 04:16:23.962953", + "step": 3438, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:16:24.065224", + "step": 3438, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003717708634212613, + "timestamp": "2025-09-04 04:16:24.084445", + "step": 3439, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:16:24.192910", + "step": 3439, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010961350053548813, + "timestamp": "2025-09-04 04:16:24.214147", + "step": 3440, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:16:32.619010", + "step": 3440, + "epoch": 3 + }, + { + "type": "pplx", + "content": 276.0165633644801, + "timestamp": "2025-09-04 04:16:32.621440", + "step": 3440, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3440", + "timestamp": "2025-09-04 04:16:33.156281", + "step": 3440, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:16:33.245723", + "step": 3440, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0005490960320457816, + "timestamp": "2025-09-04 04:16:33.264393", + "step": 3441, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:16:33.338981", + "step": 3441, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0026552164927124977, + "timestamp": "2025-09-04 04:16:33.352587", + "step": 3442, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:16:33.460260", + "step": 3442, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0013740723952651024, + "timestamp": "2025-09-04 04:16:33.480498", + "step": 3443, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:16:33.590052", + "step": 3443, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005307029001414776, + "timestamp": "2025-09-04 04:16:33.611410", + "step": 3444, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:16:33.702176", + "step": 3444, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.018815629184246063, + "timestamp": "2025-09-04 04:16:33.720515", + "step": 3445, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:16:33.821133", + "step": 3445, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0011458718217909336, + "timestamp": "2025-09-04 04:16:33.839998", + "step": 3446, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1408 + ], + "flops": 28160171015680.0 + }, + "timestamp": "2025-09-04 04:16:34.047495", + "step": 3446, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0004305221955291927, + "timestamp": "2025-09-04 04:16:34.086809", + "step": 3447, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:16:34.188269", + "step": 3447, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004843763541430235, + "timestamp": "2025-09-04 04:16:34.208262", + "step": 3448, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:16:34.314432", + "step": 3448, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005847205873578787, + "timestamp": "2025-09-04 04:16:34.337045", + "step": 3449, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:16:34.419987", + "step": 3449, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.025259602814912796, + "timestamp": "2025-09-04 04:16:34.435272", + "step": 3450, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:16:34.518599", + "step": 3450, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003980662208050489, + "timestamp": "2025-09-04 04:16:34.533632", + "step": 3451, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:16:34.627948", + "step": 3451, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0030712997540831566, + "timestamp": "2025-09-04 04:16:34.646239", + "step": 3452, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:16:34.748110", + "step": 3452, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.049204930663108826, + "timestamp": "2025-09-04 04:16:34.769245", + "step": 3453, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:16:34.860524", + "step": 3453, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00023195317771751434, + "timestamp": "2025-09-04 04:16:34.877272", + "step": 3454, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:16:34.976625", + "step": 3454, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005655570421367884, + "timestamp": "2025-09-04 04:16:34.995186", + "step": 3455, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:16:35.072036", + "step": 3455, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009977075271308422, + "timestamp": "2025-09-04 04:16:35.086890", + "step": 3456, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:16:35.177118", + "step": 3456, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.017296381294727325, + "timestamp": "2025-09-04 04:16:35.195820", + "step": 3457, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:16:35.286844", + "step": 3457, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008046641014516354, + "timestamp": "2025-09-04 04:16:35.303590", + "step": 3458, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:16:35.412460", + "step": 3458, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03136194869875908, + "timestamp": "2025-09-04 04:16:35.432984", + "step": 3459, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:16:35.536842", + "step": 3459, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00904142763465643, + "timestamp": "2025-09-04 04:16:35.556939", + "step": 3460, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:16:43.938279", + "step": 3460, + "epoch": 3 + }, + { + "type": "pplx", + "content": 273.344454429984, + "timestamp": "2025-09-04 04:16:43.940762", + "step": 3460, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:16:44.042144", + "step": 3460, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006052535958588123, + "timestamp": "2025-09-04 04:16:44.064039", + "step": 3461, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:16:44.167096", + "step": 3461, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0018702381057664752, + "timestamp": "2025-09-04 04:16:44.186364", + "step": 3462, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:16:44.289805", + "step": 3462, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006505020894110203, + "timestamp": "2025-09-04 04:16:44.309039", + "step": 3463, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:16:44.414600", + "step": 3463, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006749553140252829, + "timestamp": "2025-09-04 04:16:44.435284", + "step": 3464, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:16:44.527480", + "step": 3464, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011513489298522472, + "timestamp": "2025-09-04 04:16:44.546415", + "step": 3465, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:16:44.649578", + "step": 3465, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0009676741319708526, + "timestamp": "2025-09-04 04:16:44.668399", + "step": 3466, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 896 + ], + "flops": 17920108852736.0 + }, + "timestamp": "2025-09-04 04:16:44.799523", + "step": 3466, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0007584612467326224, + "timestamp": "2025-09-04 04:16:44.824173", + "step": 3467, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:16:44.925235", + "step": 3467, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.051189690828323364, + "timestamp": "2025-09-04 04:16:44.944884", + "step": 3468, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:16:45.043356", + "step": 3468, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004076420795172453, + "timestamp": "2025-09-04 04:16:45.063783", + "step": 3469, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:16:45.169858", + "step": 3469, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01913578435778618, + "timestamp": "2025-09-04 04:16:45.189959", + "step": 3470, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:16:45.283963", + "step": 3470, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0021679247729480267, + "timestamp": "2025-09-04 04:16:45.301246", + "step": 3471, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:16:45.397107", + "step": 3471, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00030914912349544466, + "timestamp": "2025-09-04 04:16:45.413267", + "step": 3472, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:16:45.495736", + "step": 3472, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0001749959192238748, + "timestamp": "2025-09-04 04:16:45.512249", + "step": 3473, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:16:45.616646", + "step": 3473, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008923078887164593, + "timestamp": "2025-09-04 04:16:45.635799", + "step": 3474, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:16:45.742204", + "step": 3474, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01262232568114996, + "timestamp": "2025-09-04 04:16:45.762090", + "step": 3475, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:16:45.864266", + "step": 3475, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005357048008590937, + "timestamp": "2025-09-04 04:16:45.883695", + "step": 3476, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:16:45.990063", + "step": 3476, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0005004971753805876, + "timestamp": "2025-09-04 04:16:46.012610", + "step": 3477, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:16:46.112508", + "step": 3477, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004670882131904364, + "timestamp": "2025-09-04 04:16:46.131186", + "step": 3478, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:16:46.240850", + "step": 3478, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001415494829416275, + "timestamp": "2025-09-04 04:16:46.261391", + "step": 3479, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:16:46.354980", + "step": 3479, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0019339878344908357, + "timestamp": "2025-09-04 04:16:46.372907", + "step": 3480, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:16:54.758650", + "step": 3480, + "epoch": 3 + }, + { + "type": "pplx", + "content": 273.04229426249503, + "timestamp": "2025-09-04 04:16:54.760818", + "step": 3480, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3480", + "timestamp": "2025-09-04 04:16:55.106808", + "step": 3480, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:16:55.197784", + "step": 3480, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007648364640772343, + "timestamp": "2025-09-04 04:16:55.216546", + "step": 3481, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:16:55.311636", + "step": 3481, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003384833922609687, + "timestamp": "2025-09-04 04:16:55.329096", + "step": 3482, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:16:55.421525", + "step": 3482, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0017845932161435485, + "timestamp": "2025-09-04 04:16:55.438702", + "step": 3483, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:16:55.542236", + "step": 3483, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.030296683311462402, + "timestamp": "2025-09-04 04:16:55.562301", + "step": 3484, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:16:55.659953", + "step": 3484, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010239890776574612, + "timestamp": "2025-09-04 04:16:55.680631", + "step": 3485, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:16:55.773577", + "step": 3485, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01184002310037613, + "timestamp": "2025-09-04 04:16:55.790695", + "step": 3486, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:16:55.896552", + "step": 3486, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0026469461154192686, + "timestamp": "2025-09-04 04:16:55.916558", + "step": 3487, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:16:56.020152", + "step": 3487, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006876929197460413, + "timestamp": "2025-09-04 04:16:56.040234", + "step": 3488, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:16:56.147712", + "step": 3488, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012749651446938515, + "timestamp": "2025-09-04 04:16:56.170254", + "step": 3489, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:16:56.273664", + "step": 3489, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00782372523099184, + "timestamp": "2025-09-04 04:16:56.292933", + "step": 3490, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:16:56.378232", + "step": 3490, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0010486006503924727, + "timestamp": "2025-09-04 04:16:56.393419", + "step": 3491, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:16:56.493707", + "step": 3491, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0012736011995002627, + "timestamp": "2025-09-04 04:16:56.513374", + "step": 3492, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:16:56.611057", + "step": 3492, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.019962089136242867, + "timestamp": "2025-09-04 04:16:56.631424", + "step": 3493, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:16:56.734929", + "step": 3493, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016266385093331337, + "timestamp": "2025-09-04 04:16:56.754022", + "step": 3494, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:16:56.856315", + "step": 3494, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004476721398532391, + "timestamp": "2025-09-04 04:16:56.875280", + "step": 3495, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:16:56.975567", + "step": 3495, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0004053797747474164, + "timestamp": "2025-09-04 04:16:56.995298", + "step": 3496, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:16:57.095271", + "step": 3496, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016851192340254784, + "timestamp": "2025-09-04 04:16:57.116407", + "step": 3497, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:16:57.223991", + "step": 3497, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002290160395205021, + "timestamp": "2025-09-04 04:16:57.244053", + "step": 3498, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:16:57.334762", + "step": 3498, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009426870383322239, + "timestamp": "2025-09-04 04:16:57.351647", + "step": 3499, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:16:57.454684", + "step": 3499, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010544263757765293, + "timestamp": "2025-09-04 04:16:57.474898", + "step": 3500, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:17:05.851374", + "step": 3500, + "epoch": 3 + }, + { + "type": "pplx", + "content": 278.8655476522292, + "timestamp": "2025-09-04 04:17:05.854405", + "step": 3500, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:17:05.952478", + "step": 3500, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006872169207781553, + "timestamp": "2025-09-04 04:17:05.973642", + "step": 3501, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:17:06.067259", + "step": 3501, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010031554847955704, + "timestamp": "2025-09-04 04:17:06.084553", + "step": 3502, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:17:06.174657", + "step": 3502, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005660749971866608, + "timestamp": "2025-09-04 04:17:06.191462", + "step": 3503, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:17:06.276212", + "step": 3503, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009915877133607864, + "timestamp": "2025-09-04 04:17:06.292393", + "step": 3504, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:17:06.383845", + "step": 3504, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004943343810737133, + "timestamp": "2025-09-04 04:17:06.403017", + "step": 3505, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:17:06.504669", + "step": 3505, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0015765562420710921, + "timestamp": "2025-09-04 04:17:06.523796", + "step": 3506, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:17:06.616657", + "step": 3506, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0038354985881596804, + "timestamp": "2025-09-04 04:17:06.633845", + "step": 3507, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:17:06.750740", + "step": 3507, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0014609359204769135, + "timestamp": "2025-09-04 04:17:06.770698", + "step": 3508, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:17:06.873650", + "step": 3508, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002754951361566782, + "timestamp": "2025-09-04 04:17:06.895603", + "step": 3509, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:17:06.985320", + "step": 3509, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.05803408473730087, + "timestamp": "2025-09-04 04:17:07.002134", + "step": 3510, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:17:07.104670", + "step": 3510, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002227720571681857, + "timestamp": "2025-09-04 04:17:07.123858", + "step": 3511, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:17:07.224100", + "step": 3511, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00011479722161311656, + "timestamp": "2025-09-04 04:17:07.243742", + "step": 3512, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:17:07.334422", + "step": 3512, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0112196309491992, + "timestamp": "2025-09-04 04:17:07.353494", + "step": 3513, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:17:07.447008", + "step": 3513, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00041249426431022584, + "timestamp": "2025-09-04 04:17:07.464442", + "step": 3514, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:17:07.567901", + "step": 3514, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003908245358616114, + "timestamp": "2025-09-04 04:17:07.587156", + "step": 3515, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:17:07.678752", + "step": 3515, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04177214950323105, + "timestamp": "2025-09-04 04:17:07.696276", + "step": 3516, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:17:07.795192", + "step": 3516, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.060547590255737305, + "timestamp": "2025-09-04 04:17:07.815910", + "step": 3517, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:17:07.917711", + "step": 3517, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0007211702759377658, + "timestamp": "2025-09-04 04:17:07.936836", + "step": 3518, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 04:17:08.071144", + "step": 3518, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0037805659230798483, + "timestamp": "2025-09-04 04:17:08.097057", + "step": 3519, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:17:08.190982", + "step": 3519, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007106819190084934, + "timestamp": "2025-09-04 04:17:08.209152", + "step": 3520, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:17:16.652692", + "step": 3520, + "epoch": 3 + }, + { + "type": "pplx", + "content": 285.4826197362545, + "timestamp": "2025-09-04 04:17:16.656011", + "step": 3520, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3520", + "timestamp": "2025-09-04 04:17:17.176950", + "step": 3520, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:17:17.280941", + "step": 3520, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00228358106687665, + "timestamp": "2025-09-04 04:17:17.302185", + "step": 3521, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:17:17.387962", + "step": 3521, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005259730387479067, + "timestamp": "2025-09-04 04:17:17.401639", + "step": 3522, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:17:17.494562", + "step": 3522, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0028661582618951797, + "timestamp": "2025-09-04 04:17:17.511807", + "step": 3523, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 04:17:17.646766", + "step": 3523, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0053170472383499146, + "timestamp": "2025-09-04 04:17:17.673610", + "step": 3524, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:17:17.776209", + "step": 3524, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010155066847801208, + "timestamp": "2025-09-04 04:17:17.797350", + "step": 3525, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:17:17.893899", + "step": 3525, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011916323564946651, + "timestamp": "2025-09-04 04:17:17.911517", + "step": 3526, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:17:18.002884", + "step": 3526, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006236597546376288, + "timestamp": "2025-09-04 04:17:18.019750", + "step": 3527, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:17:18.095835", + "step": 3527, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007811339106410742, + "timestamp": "2025-09-04 04:17:18.110506", + "step": 3528, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:17:18.207549", + "step": 3528, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0019810060039162636, + "timestamp": "2025-09-04 04:17:18.227876", + "step": 3529, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:17:18.339268", + "step": 3529, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0015669839922338724, + "timestamp": "2025-09-04 04:17:18.359806", + "step": 3530, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:17:18.457156", + "step": 3530, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.022024275735020638, + "timestamp": "2025-09-04 04:17:18.474719", + "step": 3531, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 04:17:18.606447", + "step": 3531, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00185906991828233, + "timestamp": "2025-09-04 04:17:18.630444", + "step": 3532, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:17:18.730940", + "step": 3532, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0033666298259049654, + "timestamp": "2025-09-04 04:17:18.752102", + "step": 3533, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1168 + ], + "flops": 23360141876800.0 + }, + "timestamp": "2025-09-04 04:17:18.926745", + "step": 3533, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0011104767909273505, + "timestamp": "2025-09-04 04:17:18.959381", + "step": 3534, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:17:19.067563", + "step": 3534, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0017678681761026382, + "timestamp": "2025-09-04 04:17:19.087547", + "step": 3535, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:17:19.195375", + "step": 3535, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003609925974160433, + "timestamp": "2025-09-04 04:17:19.216154", + "step": 3536, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:17:19.304973", + "step": 3536, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.021984629333019257, + "timestamp": "2025-09-04 04:17:19.323469", + "step": 3537, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:17:19.429598", + "step": 3537, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00020458322251215577, + "timestamp": "2025-09-04 04:17:19.449567", + "step": 3538, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:17:19.535223", + "step": 3538, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01300242729485035, + "timestamp": "2025-09-04 04:17:19.550684", + "step": 3539, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:17:19.667433", + "step": 3539, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01605132967233658, + "timestamp": "2025-09-04 04:17:19.688333", + "step": 3540, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:17:28.163839", + "step": 3540, + "epoch": 3 + }, + { + "type": "pplx", + "content": 287.8090877611623, + "timestamp": "2025-09-04 04:17:28.165953", + "step": 3540, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:17:28.267471", + "step": 3540, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0015539666637778282, + "timestamp": "2025-09-04 04:17:28.289355", + "step": 3541, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1376 + ], + "flops": 27520167130496.0 + }, + "timestamp": "2025-09-04 04:17:28.493842", + "step": 3541, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.029507221654057503, + "timestamp": "2025-09-04 04:17:28.532966", + "step": 3542, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:17:28.612593", + "step": 3542, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0015970682725310326, + "timestamp": "2025-09-04 04:17:28.626758", + "step": 3543, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:17:28.744337", + "step": 3543, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008442920632660389, + "timestamp": "2025-09-04 04:17:28.767276", + "step": 3544, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:17:28.873521", + "step": 3544, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.027983790263533592, + "timestamp": "2025-09-04 04:17:28.895818", + "step": 3545, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:17:28.987931", + "step": 3545, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0004234362568240613, + "timestamp": "2025-09-04 04:17:29.004675", + "step": 3546, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:17:29.090019", + "step": 3546, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0007317407871596515, + "timestamp": "2025-09-04 04:17:29.105455", + "step": 3547, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:17:29.208608", + "step": 3547, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00967488158494234, + "timestamp": "2025-09-04 04:17:29.228533", + "step": 3548, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:17:29.321831", + "step": 3548, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02515988238155842, + "timestamp": "2025-09-04 04:17:29.341037", + "step": 3549, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1184 + ], + "flops": 23680143819392.0 + }, + "timestamp": "2025-09-04 04:17:29.514992", + "step": 3549, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.000993523863144219, + "timestamp": "2025-09-04 04:17:29.549641", + "step": 3550, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:17:29.645946", + "step": 3550, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.000763634976465255, + "timestamp": "2025-09-04 04:17:29.663104", + "step": 3551, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:17:29.747319", + "step": 3551, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.000361975806299597, + "timestamp": "2025-09-04 04:17:29.763321", + "step": 3552, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:17:29.846943", + "step": 3552, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0030570703092962503, + "timestamp": "2025-09-04 04:17:29.864182", + "step": 3553, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:17:29.941189", + "step": 3553, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0028307621832937002, + "timestamp": "2025-09-04 04:17:29.955338", + "step": 3554, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:17:30.060042", + "step": 3554, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0015227803960442543, + "timestamp": "2025-09-04 04:17:30.079268", + "step": 3555, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:17:30.184667", + "step": 3555, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013519185595214367, + "timestamp": "2025-09-04 04:17:30.202939", + "step": 3556, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:17:30.308622", + "step": 3556, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0007381472387351096, + "timestamp": "2025-09-04 04:17:30.330915", + "step": 3557, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1376 + ], + "flops": 27520167130496.0 + }, + "timestamp": "2025-09-04 04:17:30.534526", + "step": 3557, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0012235705507919192, + "timestamp": "2025-09-04 04:17:30.573628", + "step": 3558, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:17:30.683910", + "step": 3558, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007621685042977333, + "timestamp": "2025-09-04 04:17:30.703119", + "step": 3559, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:17:30.808101", + "step": 3559, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0009364414145238698, + "timestamp": "2025-09-04 04:17:30.827407", + "step": 3560, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:17:39.214990", + "step": 3560, + "epoch": 3 + }, + { + "type": "pplx", + "content": 289.38046074506144, + "timestamp": "2025-09-04 04:17:39.216730", + "step": 3560, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3560", + "timestamp": "2025-09-04 04:17:39.578194", + "step": 3560, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 800 + ], + "flops": 16000097197184.0 + }, + "timestamp": "2025-09-04 04:17:39.694895", + "step": 3560, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.09077467024326324, + "timestamp": "2025-09-04 04:17:39.718681", + "step": 3561, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:17:39.822623", + "step": 3561, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001359087647870183, + "timestamp": "2025-09-04 04:17:39.841874", + "step": 3562, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:17:39.944941", + "step": 3562, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0016831067623570561, + "timestamp": "2025-09-04 04:17:39.964001", + "step": 3563, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:17:40.059536", + "step": 3563, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04928234592080116, + "timestamp": "2025-09-04 04:17:40.077782", + "step": 3564, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:17:40.170060", + "step": 3564, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02024707943201065, + "timestamp": "2025-09-04 04:17:40.189078", + "step": 3565, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:17:40.289261", + "step": 3565, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02083570696413517, + "timestamp": "2025-09-04 04:17:40.308255", + "step": 3566, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:17:40.409586", + "step": 3566, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006988304201513529, + "timestamp": "2025-09-04 04:17:40.428713", + "step": 3567, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:17:40.532535", + "step": 3567, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005976350978016853, + "timestamp": "2025-09-04 04:17:40.552587", + "step": 3568, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:17:40.625938", + "step": 3568, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006792505271732807, + "timestamp": "2025-09-04 04:17:40.640617", + "step": 3569, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:17:40.742711", + "step": 3569, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004221724346280098, + "timestamp": "2025-09-04 04:17:40.761826", + "step": 3570, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:17:40.863583", + "step": 3570, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0009288810542784631, + "timestamp": "2025-09-04 04:17:40.882442", + "step": 3571, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:17:40.982227", + "step": 3571, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0005838017095811665, + "timestamp": "2025-09-04 04:17:41.001628", + "step": 3572, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:17:41.090099", + "step": 3572, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0032299798913300037, + "timestamp": "2025-09-04 04:17:41.108485", + "step": 3573, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:17:41.186326", + "step": 3573, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0007045165402814746, + "timestamp": "2025-09-04 04:17:41.200322", + "step": 3574, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1472 + ], + "flops": 29440178786048.0 + }, + "timestamp": "2025-09-04 04:17:41.415846", + "step": 3574, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004880525637418032, + "timestamp": "2025-09-04 04:17:41.456770", + "step": 3575, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:17:41.552296", + "step": 3575, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01930762641131878, + "timestamp": "2025-09-04 04:17:41.570598", + "step": 3576, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:17:41.646968", + "step": 3576, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006385389715433121, + "timestamp": "2025-09-04 04:17:41.662283", + "step": 3577, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:17:41.765485", + "step": 3577, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004767129663378, + "timestamp": "2025-09-04 04:17:41.784700", + "step": 3578, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 864 + ], + "flops": 17280104967552.0 + }, + "timestamp": "2025-09-04 04:17:41.912097", + "step": 3578, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.000834185048006475, + "timestamp": "2025-09-04 04:17:41.936484", + "step": 3579, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:17:42.040776", + "step": 3579, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00024679276975803077, + "timestamp": "2025-09-04 04:17:42.060779", + "step": 3580, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:17:50.483560", + "step": 3580, + "epoch": 3 + }, + { + "type": "pplx", + "content": 284.2367375279463, + "timestamp": "2025-09-04 04:17:50.485827", + "step": 3580, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:17:50.560931", + "step": 3580, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006078935693949461, + "timestamp": "2025-09-04 04:17:50.576216", + "step": 3581, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:17:50.681804", + "step": 3581, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0132825942710042, + "timestamp": "2025-09-04 04:17:50.700909", + "step": 3582, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:17:50.804294", + "step": 3582, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012423519045114517, + "timestamp": "2025-09-04 04:17:50.823176", + "step": 3583, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:17:50.903122", + "step": 3583, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00610441155731678, + "timestamp": "2025-09-04 04:17:50.918030", + "step": 3584, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:17:51.016416", + "step": 3584, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03366504982113838, + "timestamp": "2025-09-04 04:17:51.037152", + "step": 3585, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:17:51.145741", + "step": 3585, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0012228694977238774, + "timestamp": "2025-09-04 04:17:51.166017", + "step": 3586, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:17:51.257368", + "step": 3586, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0005438943044282496, + "timestamp": "2025-09-04 04:17:51.274200", + "step": 3587, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:17:51.362295", + "step": 3587, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007683582603931427, + "timestamp": "2025-09-04 04:17:51.378679", + "step": 3588, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:17:51.469388", + "step": 3588, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0010738805867731571, + "timestamp": "2025-09-04 04:17:51.487793", + "step": 3589, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:17:51.563545", + "step": 3589, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0031101806089282036, + "timestamp": "2025-09-04 04:17:51.577287", + "step": 3590, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:17:51.676508", + "step": 3590, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.09019530564546585, + "timestamp": "2025-09-04 04:17:51.693617", + "step": 3591, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:17:51.794524", + "step": 3591, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.17219799757003784, + "timestamp": "2025-09-04 04:17:51.814168", + "step": 3592, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:17:51.915283", + "step": 3592, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005689322482794523, + "timestamp": "2025-09-04 04:17:51.936352", + "step": 3593, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:17:52.032479", + "step": 3593, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.018983401358127594, + "timestamp": "2025-09-04 04:17:52.049611", + "step": 3594, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:17:52.150681", + "step": 3594, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0016854063142091036, + "timestamp": "2025-09-04 04:17:52.169545", + "step": 3595, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:17:52.277625", + "step": 3595, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0018568774685263634, + "timestamp": "2025-09-04 04:17:52.298316", + "step": 3596, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:17:52.396728", + "step": 3596, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0010966422269120812, + "timestamp": "2025-09-04 04:17:52.417397", + "step": 3597, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:17:52.510543", + "step": 3597, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005582905374467373, + "timestamp": "2025-09-04 04:17:52.527659", + "step": 3598, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:17:52.630415", + "step": 3598, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006567910313606262, + "timestamp": "2025-09-04 04:17:52.649399", + "step": 3599, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:17:52.760999", + "step": 3599, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.021000593900680542, + "timestamp": "2025-09-04 04:17:52.782422", + "step": 3600, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:18:01.223292", + "step": 3600, + "epoch": 3 + }, + { + "type": "pplx", + "content": 273.946033699607, + "timestamp": "2025-09-04 04:18:01.225358", + "step": 3600, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3600", + "timestamp": "2025-09-04 04:18:01.586167", + "step": 3600, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:18:01.659422", + "step": 3600, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009026916697621346, + "timestamp": "2025-09-04 04:18:01.674374", + "step": 3601, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:18:01.776530", + "step": 3601, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0015143795171752572, + "timestamp": "2025-09-04 04:18:01.795323", + "step": 3602, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:18:01.880426", + "step": 3602, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012896225787699223, + "timestamp": "2025-09-04 04:18:01.895890", + "step": 3603, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:18:01.989210", + "step": 3603, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014396066777408123, + "timestamp": "2025-09-04 04:18:02.007200", + "step": 3604, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:18:02.097931", + "step": 3604, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0015026642940938473, + "timestamp": "2025-09-04 04:18:02.117140", + "step": 3605, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:18:02.208664", + "step": 3605, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0049162269569933414, + "timestamp": "2025-09-04 04:18:02.225558", + "step": 3606, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:18:02.329504", + "step": 3606, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0029911664314568043, + "timestamp": "2025-09-04 04:18:02.348781", + "step": 3607, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:18:02.458731", + "step": 3607, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005600781179964542, + "timestamp": "2025-09-04 04:18:02.480005", + "step": 3608, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:18:02.570782", + "step": 3608, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0017589996568858624, + "timestamp": "2025-09-04 04:18:02.589673", + "step": 3609, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:18:02.692776", + "step": 3609, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0030466015450656414, + "timestamp": "2025-09-04 04:18:02.711967", + "step": 3610, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:18:02.815915", + "step": 3610, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.000875060330145061, + "timestamp": "2025-09-04 04:18:02.835190", + "step": 3611, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:18:02.930579", + "step": 3611, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.000565591617487371, + "timestamp": "2025-09-04 04:18:02.948740", + "step": 3612, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:18:03.048836", + "step": 3612, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001949289464391768, + "timestamp": "2025-09-04 04:18:03.069950", + "step": 3613, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:18:03.179274", + "step": 3613, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002045362489297986, + "timestamp": "2025-09-04 04:18:03.199578", + "step": 3614, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:18:03.303512", + "step": 3614, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.025945372879505157, + "timestamp": "2025-09-04 04:18:03.322728", + "step": 3615, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:18:03.409357", + "step": 3615, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003748838324099779, + "timestamp": "2025-09-04 04:18:03.425852", + "step": 3616, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:18:03.508941", + "step": 3616, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.031195957213640213, + "timestamp": "2025-09-04 04:18:03.525966", + "step": 3617, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:18:03.617028", + "step": 3617, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0037481507752090693, + "timestamp": "2025-09-04 04:18:03.633901", + "step": 3618, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:18:03.734574", + "step": 3618, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0022598407231271267, + "timestamp": "2025-09-04 04:18:03.753419", + "step": 3619, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:18:03.856714", + "step": 3619, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.017032302916049957, + "timestamp": "2025-09-04 04:18:03.876386", + "step": 3620, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:18:12.287116", + "step": 3620, + "epoch": 3 + }, + { + "type": "pplx", + "content": 259.11186362945296, + "timestamp": "2025-09-04 04:18:12.288980", + "step": 3620, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:18:12.368787", + "step": 3620, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005365411285310984, + "timestamp": "2025-09-04 04:18:12.385332", + "step": 3621, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:18:12.495586", + "step": 3621, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03964783623814583, + "timestamp": "2025-09-04 04:18:12.515826", + "step": 3622, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 944 + ], + "flops": 18880114680512.0 + }, + "timestamp": "2025-09-04 04:18:12.651939", + "step": 3622, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005470898933708668, + "timestamp": "2025-09-04 04:18:12.678090", + "step": 3623, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:18:12.779972", + "step": 3623, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0020204551983624697, + "timestamp": "2025-09-04 04:18:12.799866", + "step": 3624, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:18:12.903116", + "step": 3624, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004941796418279409, + "timestamp": "2025-09-04 04:18:12.924916", + "step": 3625, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:18:13.028080", + "step": 3625, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00373165775090456, + "timestamp": "2025-09-04 04:18:13.047297", + "step": 3626, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:18:13.146084", + "step": 3626, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00431425916031003, + "timestamp": "2025-09-04 04:18:13.164715", + "step": 3627, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:18:13.247659", + "step": 3627, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009402657859027386, + "timestamp": "2025-09-04 04:18:13.263508", + "step": 3628, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:18:13.370905", + "step": 3628, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0007547914865426719, + "timestamp": "2025-09-04 04:18:13.393375", + "step": 3629, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:18:13.487383", + "step": 3629, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0023410057183355093, + "timestamp": "2025-09-04 04:18:13.504478", + "step": 3630, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:18:13.605540", + "step": 3630, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008979837410151958, + "timestamp": "2025-09-04 04:18:13.624342", + "step": 3631, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1168 + ], + "flops": 23360141876800.0 + }, + "timestamp": "2025-09-04 04:18:13.800587", + "step": 3631, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0013831626856699586, + "timestamp": "2025-09-04 04:18:13.833973", + "step": 3632, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:18:13.941465", + "step": 3632, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.000762230425607413, + "timestamp": "2025-09-04 04:18:13.964016", + "step": 3633, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:18:14.067950", + "step": 3633, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001655717147514224, + "timestamp": "2025-09-04 04:18:14.087177", + "step": 3634, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:18:14.165208", + "step": 3634, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00832951907068491, + "timestamp": "2025-09-04 04:18:14.179160", + "step": 3635, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:18:14.283512", + "step": 3635, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02003934420645237, + "timestamp": "2025-09-04 04:18:14.303559", + "step": 3636, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:18:14.404763", + "step": 3636, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003556882031261921, + "timestamp": "2025-09-04 04:18:14.425768", + "step": 3637, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:18:14.536861", + "step": 3637, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010267372243106365, + "timestamp": "2025-09-04 04:18:14.557550", + "step": 3638, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:18:14.659506", + "step": 3638, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004329850431531668, + "timestamp": "2025-09-04 04:18:14.678378", + "step": 3639, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:18:14.779574", + "step": 3639, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0016127810813486576, + "timestamp": "2025-09-04 04:18:14.799189", + "step": 3640, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:18:23.369155", + "step": 3640, + "epoch": 3 + }, + { + "type": "pplx", + "content": 255.24736725474492, + "timestamp": "2025-09-04 04:18:23.373987", + "step": 3640, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3640", + "timestamp": "2025-09-04 04:18:23.755326", + "step": 3640, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:18:23.832501", + "step": 3640, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0019840069580823183, + "timestamp": "2025-09-04 04:18:23.847882", + "step": 3641, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:18:23.950295", + "step": 3641, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011246290989220142, + "timestamp": "2025-09-04 04:18:23.969312", + "step": 3642, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 880 + ], + "flops": 17600106910144.0 + }, + "timestamp": "2025-09-04 04:18:24.100055", + "step": 3642, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0008128046174533665, + "timestamp": "2025-09-04 04:18:24.123694", + "step": 3643, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:18:24.217210", + "step": 3643, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002894132863730192, + "timestamp": "2025-09-04 04:18:24.234773", + "step": 3644, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1008 + ], + "flops": 20160122450880.0 + }, + "timestamp": "2025-09-04 04:18:24.379481", + "step": 3644, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0007267083274200559, + "timestamp": "2025-09-04 04:18:24.410597", + "step": 3645, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:18:24.514114", + "step": 3645, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003020522417500615, + "timestamp": "2025-09-04 04:18:24.533436", + "step": 3646, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:18:24.642738", + "step": 3646, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007278635632246733, + "timestamp": "2025-09-04 04:18:24.663068", + "step": 3647, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:18:24.762487", + "step": 3647, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.020761605352163315, + "timestamp": "2025-09-04 04:18:24.781998", + "step": 3648, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:18:24.885520", + "step": 3648, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0007095049950294197, + "timestamp": "2025-09-04 04:18:24.906792", + "step": 3649, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:18:25.028566", + "step": 3649, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002251858590170741, + "timestamp": "2025-09-04 04:18:25.047729", + "step": 3650, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:18:25.126742", + "step": 3650, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0013295934768393636, + "timestamp": "2025-09-04 04:18:25.140980", + "step": 3651, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:18:25.231776", + "step": 3651, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.023495763540267944, + "timestamp": "2025-09-04 04:18:25.249318", + "step": 3652, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:18:25.366140", + "step": 3652, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012384490109980106, + "timestamp": "2025-09-04 04:18:25.390466", + "step": 3653, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:18:25.484835", + "step": 3653, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0022683092392981052, + "timestamp": "2025-09-04 04:18:25.502380", + "step": 3654, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:18:25.577592", + "step": 3654, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0017986752791330218, + "timestamp": "2025-09-04 04:18:25.591074", + "step": 3655, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:18:25.694822", + "step": 3655, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007066111546009779, + "timestamp": "2025-09-04 04:18:25.714876", + "step": 3656, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:18:25.811673", + "step": 3656, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002684858627617359, + "timestamp": "2025-09-04 04:18:25.832055", + "step": 3657, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 928 + ], + "flops": 18560112737920.0 + }, + "timestamp": "2025-09-04 04:18:25.966783", + "step": 3657, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001188501249998808, + "timestamp": "2025-09-04 04:18:25.992867", + "step": 3658, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:18:26.095363", + "step": 3658, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005962354131042957, + "timestamp": "2025-09-04 04:18:26.114338", + "step": 3659, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:18:26.201582", + "step": 3659, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0010241111740469933, + "timestamp": "2025-09-04 04:18:26.218028", + "step": 3660, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:18:34.655558", + "step": 3660, + "epoch": 3 + }, + { + "type": "pplx", + "content": 260.65734102692426, + "timestamp": "2025-09-04 04:18:34.657505", + "step": 3660, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:18:34.756784", + "step": 3660, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0069060372188687325, + "timestamp": "2025-09-04 04:18:34.778088", + "step": 3661, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:18:34.856779", + "step": 3661, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0014873266918584704, + "timestamp": "2025-09-04 04:18:34.871044", + "step": 3662, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:18:34.948854", + "step": 3662, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005059961229562759, + "timestamp": "2025-09-04 04:18:34.963083", + "step": 3663, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:18:35.081870", + "step": 3663, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00022246531443670392, + "timestamp": "2025-09-04 04:18:35.103259", + "step": 3664, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:18:35.206724", + "step": 3664, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005162604618817568, + "timestamp": "2025-09-04 04:18:35.222293", + "step": 3665, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:18:35.326289", + "step": 3665, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0015102955512702465, + "timestamp": "2025-09-04 04:18:35.345704", + "step": 3666, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:18:35.452901", + "step": 3666, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0005610152729786932, + "timestamp": "2025-09-04 04:18:35.472972", + "step": 3667, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:18:35.578242", + "step": 3667, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002261679619550705, + "timestamp": "2025-09-04 04:18:35.598405", + "step": 3668, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:18:35.706464", + "step": 3668, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009873145259916782, + "timestamp": "2025-09-04 04:18:35.728526", + "step": 3669, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:18:35.822179", + "step": 3669, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006896741688251495, + "timestamp": "2025-09-04 04:18:35.839074", + "step": 3670, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:18:35.933182", + "step": 3670, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0011464201379567385, + "timestamp": "2025-09-04 04:18:35.950685", + "step": 3671, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:18:36.044556", + "step": 3671, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001933246268890798, + "timestamp": "2025-09-04 04:18:36.062058", + "step": 3672, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:18:36.153756", + "step": 3672, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005047465208917856, + "timestamp": "2025-09-04 04:18:36.173094", + "step": 3673, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:18:36.264431", + "step": 3673, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01326705701649189, + "timestamp": "2025-09-04 04:18:36.279731", + "step": 3674, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:18:36.390540", + "step": 3674, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001728372648358345, + "timestamp": "2025-09-04 04:18:36.409184", + "step": 3675, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:18:36.511047", + "step": 3675, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00010324228787794709, + "timestamp": "2025-09-04 04:18:36.531104", + "step": 3676, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:18:36.631106", + "step": 3676, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006894250982441008, + "timestamp": "2025-09-04 04:18:36.651977", + "step": 3677, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:18:36.737949", + "step": 3677, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0178501196205616, + "timestamp": "2025-09-04 04:18:36.753638", + "step": 3678, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:18:36.847722", + "step": 3678, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004994639195501804, + "timestamp": "2025-09-04 04:18:36.865037", + "step": 3679, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:18:36.970830", + "step": 3679, + "epoch": 3 + }, + { + "type": "loss", + "content": 9.363189019495621e-05, + "timestamp": "2025-09-04 04:18:36.991002", + "step": 3680, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:18:45.564502", + "step": 3680, + "epoch": 3 + }, + { + "type": "pplx", + "content": 269.183933665212, + "timestamp": "2025-09-04 04:18:45.567064", + "step": 3680, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3680", + "timestamp": "2025-09-04 04:18:45.962752", + "step": 3680, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:18:46.060324", + "step": 3680, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016621418297290802, + "timestamp": "2025-09-04 04:18:46.080876", + "step": 3681, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:18:46.188302", + "step": 3681, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010536265559494495, + "timestamp": "2025-09-04 04:18:46.208241", + "step": 3682, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:18:46.313573", + "step": 3682, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010933700948953629, + "timestamp": "2025-09-04 04:18:46.332960", + "step": 3683, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1424 + ], + "flops": 28480172958272.0 + }, + "timestamp": "2025-09-04 04:18:46.545657", + "step": 3683, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006731206551194191, + "timestamp": "2025-09-04 04:18:46.587241", + "step": 3684, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:18:46.686304", + "step": 3684, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.017399858683347702, + "timestamp": "2025-09-04 04:18:46.706662", + "step": 3685, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:18:46.797807", + "step": 3685, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0017027389258146286, + "timestamp": "2025-09-04 04:18:46.814735", + "step": 3686, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:18:46.909873", + "step": 3686, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0014917801599949598, + "timestamp": "2025-09-04 04:18:46.927291", + "step": 3687, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:18:47.032604", + "step": 3687, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007048693019896746, + "timestamp": "2025-09-04 04:18:47.052648", + "step": 3688, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:18:47.134919", + "step": 3688, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.023266077041625977, + "timestamp": "2025-09-04 04:18:47.151532", + "step": 3689, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:18:47.260045", + "step": 3689, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005440382286906242, + "timestamp": "2025-09-04 04:18:47.280442", + "step": 3690, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:18:47.392196", + "step": 3690, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005371682345867157, + "timestamp": "2025-09-04 04:18:47.411099", + "step": 3691, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:18:47.506234", + "step": 3691, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03604043647646904, + "timestamp": "2025-09-04 04:18:47.524627", + "step": 3692, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:18:47.598228", + "step": 3692, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016086634248495102, + "timestamp": "2025-09-04 04:18:47.613044", + "step": 3693, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:18:47.712403", + "step": 3693, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003654019208624959, + "timestamp": "2025-09-04 04:18:47.731016", + "step": 3694, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:18:47.849362", + "step": 3694, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0030856921803206205, + "timestamp": "2025-09-04 04:18:47.871448", + "step": 3695, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:18:47.976083", + "step": 3695, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0017932873452082276, + "timestamp": "2025-09-04 04:18:47.996135", + "step": 3696, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1024 + ], + "flops": 20480124393472.0 + }, + "timestamp": "2025-09-04 04:18:48.139248", + "step": 3696, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0021279409993439913, + "timestamp": "2025-09-04 04:18:48.170357", + "step": 3697, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:18:48.278683", + "step": 3697, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0009123628260567784, + "timestamp": "2025-09-04 04:18:48.299080", + "step": 3698, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:18:48.395269", + "step": 3698, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00036065455060452223, + "timestamp": "2025-09-04 04:18:48.412856", + "step": 3699, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:18:48.507808", + "step": 3699, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.019889622926712036, + "timestamp": "2025-09-04 04:18:48.525968", + "step": 3700, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:18:57.005155", + "step": 3700, + "epoch": 3 + }, + { + "type": "pplx", + "content": 276.00496979251966, + "timestamp": "2025-09-04 04:18:57.008172", + "step": 3700, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:18:57.106990", + "step": 3700, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03130375221371651, + "timestamp": "2025-09-04 04:18:57.127767", + "step": 3701, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:18:57.230808", + "step": 3701, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.054532185196876526, + "timestamp": "2025-09-04 04:18:57.249702", + "step": 3702, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:18:57.325769", + "step": 3702, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008000990375876427, + "timestamp": "2025-09-04 04:18:57.339443", + "step": 3703, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:18:57.430429", + "step": 3703, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00035263263271190226, + "timestamp": "2025-09-04 04:18:57.448000", + "step": 3704, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:18:57.554283", + "step": 3704, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003820637473836541, + "timestamp": "2025-09-04 04:18:57.576978", + "step": 3705, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:18:57.672220", + "step": 3705, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006798545364290476, + "timestamp": "2025-09-04 04:18:57.689846", + "step": 3706, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:18:57.793776", + "step": 3706, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006438225973397493, + "timestamp": "2025-09-04 04:18:57.813047", + "step": 3707, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:18:57.889419", + "step": 3707, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00427033007144928, + "timestamp": "2025-09-04 04:18:57.903748", + "step": 3708, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:18:57.994336", + "step": 3708, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0027048927731812, + "timestamp": "2025-09-04 04:18:58.013185", + "step": 3709, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:18:58.121817", + "step": 3709, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.028957033529877663, + "timestamp": "2025-09-04 04:18:58.140618", + "step": 3710, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:18:58.242805", + "step": 3710, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006207419210113585, + "timestamp": "2025-09-04 04:18:58.261928", + "step": 3711, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:18:58.372427", + "step": 3711, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011274183169007301, + "timestamp": "2025-09-04 04:18:58.393827", + "step": 3712, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:18:58.474801", + "step": 3712, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001740701962262392, + "timestamp": "2025-09-04 04:18:58.491381", + "step": 3713, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1184 + ], + "flops": 23680143819392.0 + }, + "timestamp": "2025-09-04 04:18:58.662835", + "step": 3713, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008133734576404095, + "timestamp": "2025-09-04 04:18:58.697505", + "step": 3714, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:18:58.799806", + "step": 3714, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011398537084460258, + "timestamp": "2025-09-04 04:18:58.819083", + "step": 3715, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:18:58.922406", + "step": 3715, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006214221939444542, + "timestamp": "2025-09-04 04:18:58.942354", + "step": 3716, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:18:59.044963", + "step": 3716, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00310851470567286, + "timestamp": "2025-09-04 04:18:59.066118", + "step": 3717, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:18:59.142839", + "step": 3717, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003634780179709196, + "timestamp": "2025-09-04 04:18:59.156277", + "step": 3718, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:18:59.252361", + "step": 3718, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0004398180462885648, + "timestamp": "2025-09-04 04:18:59.270038", + "step": 3719, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:18:59.357108", + "step": 3719, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0019110249122604728, + "timestamp": "2025-09-04 04:18:59.373500", + "step": 3720, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:19:07.964079", + "step": 3720, + "epoch": 3 + }, + { + "type": "pplx", + "content": 281.6937485671799, + "timestamp": "2025-09-04 04:19:07.966081", + "step": 3720, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3720", + "timestamp": "2025-09-04 04:19:08.325947", + "step": 3720, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:19:08.399210", + "step": 3720, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012652603909373283, + "timestamp": "2025-09-04 04:19:08.414237", + "step": 3721, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:19:08.508172", + "step": 3721, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007865807972848415, + "timestamp": "2025-09-04 04:19:08.525527", + "step": 3722, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:19:08.628679", + "step": 3722, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03159695491194725, + "timestamp": "2025-09-04 04:19:08.647870", + "step": 3723, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:19:08.735900", + "step": 3723, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004598596598953009, + "timestamp": "2025-09-04 04:19:08.752357", + "step": 3724, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1232 + ], + "flops": 24640149647168.0 + }, + "timestamp": "2025-09-04 04:19:08.931081", + "step": 3724, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013268624432384968, + "timestamp": "2025-09-04 04:19:08.968763", + "step": 3725, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:19:09.054834", + "step": 3725, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0011762577341869473, + "timestamp": "2025-09-04 04:19:09.070399", + "step": 3726, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:19:09.173921", + "step": 3726, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04035170376300812, + "timestamp": "2025-09-04 04:19:09.193294", + "step": 3727, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:19:09.292993", + "step": 3727, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.000641504826489836, + "timestamp": "2025-09-04 04:19:09.312305", + "step": 3728, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 944 + ], + "flops": 18880114680512.0 + }, + "timestamp": "2025-09-04 04:19:09.445418", + "step": 3728, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00042860963731072843, + "timestamp": "2025-09-04 04:19:09.474369", + "step": 3729, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:19:09.561153", + "step": 3729, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01674291118979454, + "timestamp": "2025-09-04 04:19:09.576784", + "step": 3730, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:19:09.687305", + "step": 3730, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04544749855995178, + "timestamp": "2025-09-04 04:19:09.707970", + "step": 3731, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:19:09.809977", + "step": 3731, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0015480854781344533, + "timestamp": "2025-09-04 04:19:09.830021", + "step": 3732, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:19:09.918419", + "step": 3732, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00010000986367231235, + "timestamp": "2025-09-04 04:19:09.936774", + "step": 3733, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:19:10.037412", + "step": 3733, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.014707312919199467, + "timestamp": "2025-09-04 04:19:10.056314", + "step": 3734, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:19:10.139093", + "step": 3734, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0023547126911580563, + "timestamp": "2025-09-04 04:19:10.153214", + "step": 3735, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:19:10.238659", + "step": 3735, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005836615804582834, + "timestamp": "2025-09-04 04:19:10.254980", + "step": 3736, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:19:10.353590", + "step": 3736, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00020389581914059818, + "timestamp": "2025-09-04 04:19:10.374400", + "step": 3737, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:19:10.478398", + "step": 3737, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0003900925803463906, + "timestamp": "2025-09-04 04:19:10.497684", + "step": 3738, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:19:10.596881", + "step": 3738, + "epoch": 3 + }, + { + "type": "loss", + "content": 6.265490083023906e-05, + "timestamp": "2025-09-04 04:19:10.615552", + "step": 3739, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 960 + ], + "flops": 19200116623104.0 + }, + "timestamp": "2025-09-04 04:19:10.753104", + "step": 3739, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04083415865898132, + "timestamp": "2025-09-04 04:19:10.780282", + "step": 3740, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:19:19.338127", + "step": 3740, + "epoch": 3 + }, + { + "type": "pplx", + "content": 285.6040904345629, + "timestamp": "2025-09-04 04:19:19.340007", + "step": 3740, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 04:19:19.457232", + "step": 3740, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0010513699380680919, + "timestamp": "2025-09-04 04:19:19.482682", + "step": 3741, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1376 + ], + "flops": 27520167130496.0 + }, + "timestamp": "2025-09-04 04:19:19.687690", + "step": 3741, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0182512030005455, + "timestamp": "2025-09-04 04:19:19.726797", + "step": 3742, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:19:19.838392", + "step": 3742, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0008120352867990732, + "timestamp": "2025-09-04 04:19:19.859139", + "step": 3743, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:19:19.958398", + "step": 3743, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0015680500073358417, + "timestamp": "2025-09-04 04:19:19.976612", + "step": 3744, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:19:20.080286", + "step": 3744, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007197872269898653, + "timestamp": "2025-09-04 04:19:20.099097", + "step": 3745, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:19:20.227801", + "step": 3745, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011416764929890633, + "timestamp": "2025-09-04 04:19:20.248322", + "step": 3746, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:19:20.402738", + "step": 3746, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004257888998836279, + "timestamp": "2025-09-04 04:19:20.423364", + "step": 3747, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:19:20.563747", + "step": 3747, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004272022750228643, + "timestamp": "2025-09-04 04:19:20.583768", + "step": 3748, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:19:20.696830", + "step": 3748, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0040796962566673756, + "timestamp": "2025-09-04 04:19:20.716140", + "step": 3749, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:19:20.816046", + "step": 3749, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0027478632982820272, + "timestamp": "2025-09-04 04:19:20.835598", + "step": 3750, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:19:20.984592", + "step": 3750, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0048896921798586845, + "timestamp": "2025-09-04 04:19:21.003477", + "step": 3751, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:19:21.136153", + "step": 3751, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00022235576761886477, + "timestamp": "2025-09-04 04:19:21.152425", + "step": 3752, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:19:21.253033", + "step": 3752, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009761957451701164, + "timestamp": "2025-09-04 04:19:21.272287", + "step": 3753, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:19:21.379524", + "step": 3753, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006642633117735386, + "timestamp": "2025-09-04 04:19:21.398711", + "step": 3754, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:19:21.513529", + "step": 3754, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0035070693120360374, + "timestamp": "2025-09-04 04:19:21.534261", + "step": 3755, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:19:21.639442", + "step": 3755, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04340730234980583, + "timestamp": "2025-09-04 04:19:21.659626", + "step": 3756, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:19:21.754047", + "step": 3756, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.02027786523103714, + "timestamp": "2025-09-04 04:19:21.771006", + "step": 3757, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 832 + ], + "flops": 16640101082368.0 + }, + "timestamp": "2025-09-04 04:19:21.906935", + "step": 3757, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.029796713963150978, + "timestamp": "2025-09-04 04:19:21.930003", + "step": 3758, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:19:22.044773", + "step": 3758, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003318408038467169, + "timestamp": "2025-09-04 04:19:22.065227", + "step": 3759, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:19:22.205933", + "step": 3759, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00017196725821122527, + "timestamp": "2025-09-04 04:19:22.228658", + "step": 3760, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:19:31.072582", + "step": 3760, + "epoch": 3 + }, + { + "type": "pplx", + "content": 286.6894182494486, + "timestamp": "2025-09-04 04:19:31.075091", + "step": 3760, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3760", + "timestamp": "2025-09-04 04:19:31.569145", + "step": 3760, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:19:31.696385", + "step": 3760, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.031767360866069794, + "timestamp": "2025-09-04 04:19:31.718702", + "step": 3761, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:19:31.824881", + "step": 3761, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008120999671518803, + "timestamp": "2025-09-04 04:19:31.843846", + "step": 3762, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:19:31.923836", + "step": 3762, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04948921501636505, + "timestamp": "2025-09-04 04:19:31.937734", + "step": 3763, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:19:32.043823", + "step": 3763, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0013902034843340516, + "timestamp": "2025-09-04 04:19:32.063625", + "step": 3764, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:19:32.162247", + "step": 3764, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0031580275390297174, + "timestamp": "2025-09-04 04:19:32.182508", + "step": 3765, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:19:32.277902", + "step": 3765, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0007031270652078092, + "timestamp": "2025-09-04 04:19:32.295167", + "step": 3766, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1024 + ], + "flops": 20480124393472.0 + }, + "timestamp": "2025-09-04 04:19:32.446073", + "step": 3766, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006634030491113663, + "timestamp": "2025-09-04 04:19:32.474137", + "step": 3767, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 416 + ], + "flops": 8320050574976.0 + }, + "timestamp": "2025-09-04 04:19:32.546675", + "step": 3767, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001953285885974765, + "timestamp": "2025-09-04 04:19:32.559867", + "step": 3768, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:19:32.663220", + "step": 3768, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0026485491544008255, + "timestamp": "2025-09-04 04:19:32.684306", + "step": 3769, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:19:32.796110", + "step": 3769, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0007451950805261731, + "timestamp": "2025-09-04 04:19:32.816335", + "step": 3770, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1120 + ], + "flops": 22400136049024.0 + }, + "timestamp": "2025-09-04 04:19:32.980248", + "step": 3770, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00019487171084620059, + "timestamp": "2025-09-04 04:19:33.012027", + "step": 3771, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:19:33.098017", + "step": 3771, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0070730033330619335, + "timestamp": "2025-09-04 04:19:33.113870", + "step": 3772, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:19:33.217274", + "step": 3772, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0077186161652207375, + "timestamp": "2025-09-04 04:19:33.238178", + "step": 3773, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 416 + ], + "flops": 8320050574976.0 + }, + "timestamp": "2025-09-04 04:19:33.310724", + "step": 3773, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003549723420292139, + "timestamp": "2025-09-04 04:19:33.323108", + "step": 3774, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:19:33.436045", + "step": 3774, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.026352064684033394, + "timestamp": "2025-09-04 04:19:33.456603", + "step": 3775, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:19:33.544446", + "step": 3775, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007274407893419266, + "timestamp": "2025-09-04 04:19:33.560495", + "step": 3776, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:19:33.663022", + "step": 3776, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005042992066591978, + "timestamp": "2025-09-04 04:19:33.683851", + "step": 3777, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:19:33.785311", + "step": 3777, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00039034750079736114, + "timestamp": "2025-09-04 04:19:33.803786", + "step": 3778, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:19:33.883644", + "step": 3778, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00030172173865139484, + "timestamp": "2025-09-04 04:19:33.897563", + "step": 3779, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:19:34.007601", + "step": 3779, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.017556700855493546, + "timestamp": "2025-09-04 04:19:34.028541", + "step": 3780, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:19:42.961188", + "step": 3780, + "epoch": 3 + }, + { + "type": "pplx", + "content": 286.5459421244075, + "timestamp": "2025-09-04 04:19:42.963267", + "step": 3780, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:19:43.045692", + "step": 3780, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.012321644462645054, + "timestamp": "2025-09-04 04:19:43.062957", + "step": 3781, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:19:43.157605", + "step": 3781, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002354747848585248, + "timestamp": "2025-09-04 04:19:43.175092", + "step": 3782, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:19:43.282378", + "step": 3782, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.026513388380408287, + "timestamp": "2025-09-04 04:19:43.302750", + "step": 3783, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:19:43.378659", + "step": 3783, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003994217608124018, + "timestamp": "2025-09-04 04:19:43.393283", + "step": 3784, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:19:43.496515", + "step": 3784, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001885236008092761, + "timestamp": "2025-09-04 04:19:43.518555", + "step": 3785, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:19:43.630362", + "step": 3785, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011333000846207142, + "timestamp": "2025-09-04 04:19:43.651043", + "step": 3786, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:19:43.744476", + "step": 3786, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0012821113923564553, + "timestamp": "2025-09-04 04:19:43.761698", + "step": 3787, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:19:43.863342", + "step": 3787, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0025108223780989647, + "timestamp": "2025-09-04 04:19:43.882925", + "step": 3788, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1088 + ], + "flops": 21760132163840.0 + }, + "timestamp": "2025-09-04 04:19:44.035332", + "step": 3788, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.016976939514279366, + "timestamp": "2025-09-04 04:19:44.069075", + "step": 3789, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:19:44.176089", + "step": 3789, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007931654341518879, + "timestamp": "2025-09-04 04:19:44.195309", + "step": 3790, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:19:44.291098", + "step": 3790, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005856201983988285, + "timestamp": "2025-09-04 04:19:44.308591", + "step": 3791, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:19:44.417595", + "step": 3791, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0056821079924702644, + "timestamp": "2025-09-04 04:19:44.438676", + "step": 3792, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:19:44.523040", + "step": 3792, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00843026302754879, + "timestamp": "2025-09-04 04:19:44.540136", + "step": 3793, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:19:44.633107", + "step": 3793, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009367075748741627, + "timestamp": "2025-09-04 04:19:44.648653", + "step": 3794, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:19:44.749854", + "step": 3794, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01958874613046646, + "timestamp": "2025-09-04 04:19:44.768794", + "step": 3795, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:19:44.863243", + "step": 3795, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003997324500232935, + "timestamp": "2025-09-04 04:19:44.881646", + "step": 3796, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:19:44.986383", + "step": 3796, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00712942611426115, + "timestamp": "2025-09-04 04:19:45.007700", + "step": 3797, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 800 + ], + "flops": 16000097197184.0 + }, + "timestamp": "2025-09-04 04:19:45.127722", + "step": 3797, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004085378255695105, + "timestamp": "2025-09-04 04:19:45.149603", + "step": 3798, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:19:45.247231", + "step": 3798, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0033143635373562574, + "timestamp": "2025-09-04 04:19:45.264092", + "step": 3799, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:19:45.363499", + "step": 3799, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01740036904811859, + "timestamp": "2025-09-04 04:19:45.381789", + "step": 3800, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:19:53.897411", + "step": 3800, + "epoch": 3 + }, + { + "type": "pplx", + "content": 287.8095566615489, + "timestamp": "2025-09-04 04:19:53.899499", + "step": 3800, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3800", + "timestamp": "2025-09-04 04:19:54.373165", + "step": 3800, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 384 + ], + "flops": 7680046689792.0 + }, + "timestamp": "2025-09-04 04:19:54.434055", + "step": 3800, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004595032427459955, + "timestamp": "2025-09-04 04:19:54.446095", + "step": 3801, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:19:54.529226", + "step": 3801, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005345925688743591, + "timestamp": "2025-09-04 04:19:54.544393", + "step": 3802, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 896 + ], + "flops": 17920108852736.0 + }, + "timestamp": "2025-09-04 04:19:54.673012", + "step": 3802, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0032179898116737604, + "timestamp": "2025-09-04 04:19:54.697760", + "step": 3803, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:19:54.784993", + "step": 3803, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011905638501048088, + "timestamp": "2025-09-04 04:19:54.801510", + "step": 3804, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:19:54.909779", + "step": 3804, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00038323350599966943, + "timestamp": "2025-09-04 04:19:54.932539", + "step": 3805, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:19:55.042815", + "step": 3805, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010173635557293892, + "timestamp": "2025-09-04 04:19:55.063495", + "step": 3806, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:19:55.167220", + "step": 3806, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010474266484379768, + "timestamp": "2025-09-04 04:19:55.186635", + "step": 3807, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:19:55.289309", + "step": 3807, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.006612957455217838, + "timestamp": "2025-09-04 04:19:55.309016", + "step": 3808, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:19:55.385129", + "step": 3808, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005502256099134684, + "timestamp": "2025-09-04 04:19:55.400714", + "step": 3809, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:19:55.519741", + "step": 3809, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0005113132647238672, + "timestamp": "2025-09-04 04:19:55.540420", + "step": 3810, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:19:55.643172", + "step": 3810, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004534454550594091, + "timestamp": "2025-09-04 04:19:55.662408", + "step": 3811, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:19:55.766556", + "step": 3811, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0004610498435795307, + "timestamp": "2025-09-04 04:19:55.786680", + "step": 3812, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:19:55.874100", + "step": 3812, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01628238335251808, + "timestamp": "2025-09-04 04:19:55.892557", + "step": 3813, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:19:55.996550", + "step": 3813, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0012509598163887858, + "timestamp": "2025-09-04 04:19:56.015759", + "step": 3814, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:19:56.116208", + "step": 3814, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002474145032465458, + "timestamp": "2025-09-04 04:19:56.135188", + "step": 3815, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:19:56.239734", + "step": 3815, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0014434803742915392, + "timestamp": "2025-09-04 04:19:56.259899", + "step": 3816, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 816 + ], + "flops": 16320099139776.0 + }, + "timestamp": "2025-09-04 04:19:56.379011", + "step": 3816, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0014426189009100199, + "timestamp": "2025-09-04 04:19:56.404527", + "step": 3817, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 576 + ], + "flops": 11520070000896.0 + }, + "timestamp": "2025-09-04 04:19:56.491449", + "step": 3817, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005996841937303543, + "timestamp": "2025-09-04 04:19:56.507188", + "step": 3818, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:19:56.606655", + "step": 3818, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0009320307872258127, + "timestamp": "2025-09-04 04:19:56.625375", + "step": 3819, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:19:56.737655", + "step": 3819, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.028978558257222176, + "timestamp": "2025-09-04 04:19:56.757294", + "step": 3820, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:20:05.260868", + "step": 3820, + "epoch": 3 + }, + { + "type": "pplx", + "content": 290.2094912937652, + "timestamp": "2025-09-04 04:20:05.263045", + "step": 3820, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:20:05.362272", + "step": 3820, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0015448291087523103, + "timestamp": "2025-09-04 04:20:05.383584", + "step": 3821, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:20:05.479527", + "step": 3821, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013662063516676426, + "timestamp": "2025-09-04 04:20:05.497092", + "step": 3822, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:20:05.601600", + "step": 3822, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003193710930645466, + "timestamp": "2025-09-04 04:20:05.620767", + "step": 3823, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:20:05.722416", + "step": 3823, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.026605522260069847, + "timestamp": "2025-09-04 04:20:05.741885", + "step": 3824, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:20:05.840824", + "step": 3824, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0024066600017249584, + "timestamp": "2025-09-04 04:20:05.861635", + "step": 3825, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:20:05.965300", + "step": 3825, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0002010101597988978, + "timestamp": "2025-09-04 04:20:05.984532", + "step": 3826, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:20:06.093413", + "step": 3826, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.01071922481060028, + "timestamp": "2025-09-04 04:20:06.113643", + "step": 3827, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:20:06.231680", + "step": 3827, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00014394304889719933, + "timestamp": "2025-09-04 04:20:06.254710", + "step": 3828, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:20:06.346407", + "step": 3828, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001017340342514217, + "timestamp": "2025-09-04 04:20:06.365595", + "step": 3829, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 512 + ], + "flops": 10240062230528.0 + }, + "timestamp": "2025-09-04 04:20:06.443920", + "step": 3829, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00646333210170269, + "timestamp": "2025-09-04 04:20:06.458192", + "step": 3830, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:20:06.564966", + "step": 3830, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.002317563397809863, + "timestamp": "2025-09-04 04:20:06.585040", + "step": 3831, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:20:06.688740", + "step": 3831, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.025038516148924828, + "timestamp": "2025-09-04 04:20:06.708726", + "step": 3832, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:20:06.790921", + "step": 3832, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0008068301249295473, + "timestamp": "2025-09-04 04:20:06.807722", + "step": 3833, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 784 + ], + "flops": 15680095254592.0 + }, + "timestamp": "2025-09-04 04:20:06.924763", + "step": 3833, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007588067092001438, + "timestamp": "2025-09-04 04:20:06.946951", + "step": 3834, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:20:07.023668", + "step": 3834, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0007368748192675412, + "timestamp": "2025-09-04 04:20:07.037543", + "step": 3835, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:20:07.144531", + "step": 3835, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00043095918954350054, + "timestamp": "2025-09-04 04:20:07.165248", + "step": 3836, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:20:07.273966", + "step": 3836, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004333295859396458, + "timestamp": "2025-09-04 04:20:07.296790", + "step": 3837, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:20:07.394901", + "step": 3837, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0017703445628285408, + "timestamp": "2025-09-04 04:20:07.412129", + "step": 3838, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:20:07.495915", + "step": 3838, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03642124682664871, + "timestamp": "2025-09-04 04:20:07.511279", + "step": 3839, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:20:07.604411", + "step": 3839, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.015598982572555542, + "timestamp": "2025-09-04 04:20:07.622411", + "step": 3840, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:20:16.109142", + "step": 3840, + "epoch": 3 + }, + { + "type": "pplx", + "content": 293.44611506915754, + "timestamp": "2025-09-04 04:20:16.111391", + "step": 3840, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3840", + "timestamp": "2025-09-04 04:20:16.624255", + "step": 3840, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:20:16.730224", + "step": 3840, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03652084618806839, + "timestamp": "2025-09-04 04:20:16.752751", + "step": 3841, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:20:16.863261", + "step": 3841, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0017639618599787354, + "timestamp": "2025-09-04 04:20:16.883817", + "step": 3842, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:20:16.978707", + "step": 3842, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011736215092241764, + "timestamp": "2025-09-04 04:20:16.996332", + "step": 3843, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 432 + ], + "flops": 8640052517568.0 + }, + "timestamp": "2025-09-04 04:20:17.068336", + "step": 3843, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006759539246559143, + "timestamp": "2025-09-04 04:20:17.081947", + "step": 3844, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:20:17.165333", + "step": 3844, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001056056353263557, + "timestamp": "2025-09-04 04:20:17.181929", + "step": 3845, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:20:17.282369", + "step": 3845, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00557122053578496, + "timestamp": "2025-09-04 04:20:17.301087", + "step": 3846, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:20:17.402195", + "step": 3846, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010494154877960682, + "timestamp": "2025-09-04 04:20:17.421155", + "step": 3847, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:20:17.525683", + "step": 3847, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0003855594841297716, + "timestamp": "2025-09-04 04:20:17.545741", + "step": 3848, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:20:17.643547", + "step": 3848, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0037584335077553988, + "timestamp": "2025-09-04 04:20:17.664083", + "step": 3849, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:20:17.772097", + "step": 3849, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008188803680241108, + "timestamp": "2025-09-04 04:20:17.792209", + "step": 3850, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:20:17.898084", + "step": 3850, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0006697289645671844, + "timestamp": "2025-09-04 04:20:17.917502", + "step": 3851, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:20:17.995980", + "step": 3851, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.025056758895516396, + "timestamp": "2025-09-04 04:20:18.010432", + "step": 3852, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:20:18.106741", + "step": 3852, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005154153797775507, + "timestamp": "2025-09-04 04:20:18.125833", + "step": 3853, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 720 + ], + "flops": 14400087484224.0 + }, + "timestamp": "2025-09-04 04:20:18.232402", + "step": 3853, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009047658182680607, + "timestamp": "2025-09-04 04:20:18.252461", + "step": 3854, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:20:18.339002", + "step": 3854, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010006698779761791, + "timestamp": "2025-09-04 04:20:18.354353", + "step": 3855, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:20:18.458829", + "step": 3855, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00595315545797348, + "timestamp": "2025-09-04 04:20:18.476860", + "step": 3856, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 592 + ], + "flops": 11840071943488.0 + }, + "timestamp": "2025-09-04 04:20:18.566251", + "step": 3856, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0007929496932774782, + "timestamp": "2025-09-04 04:20:18.584632", + "step": 3857, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:20:18.661728", + "step": 3857, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0036627210211008787, + "timestamp": "2025-09-04 04:20:18.675620", + "step": 3858, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:20:18.778203", + "step": 3858, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0008201138116419315, + "timestamp": "2025-09-04 04:20:18.797517", + "step": 3859, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:20:18.900813", + "step": 3859, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0017376311589032412, + "timestamp": "2025-09-04 04:20:18.920853", + "step": 3860, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:20:27.435062", + "step": 3860, + "epoch": 3 + }, + { + "type": "pplx", + "content": 294.3503277315976, + "timestamp": "2025-09-04 04:20:27.437138", + "step": 3860, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:20:27.517188", + "step": 3860, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011900283396244049, + "timestamp": "2025-09-04 04:20:27.533953", + "step": 3861, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:20:27.642933", + "step": 3861, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.015471918508410454, + "timestamp": "2025-09-04 04:20:27.663390", + "step": 3862, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:20:27.766648", + "step": 3862, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.001793242641724646, + "timestamp": "2025-09-04 04:20:27.785811", + "step": 3863, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:20:27.879737", + "step": 3863, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00023462541867047548, + "timestamp": "2025-09-04 04:20:27.897669", + "step": 3864, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:20:28.002948", + "step": 3864, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.005305502098053694, + "timestamp": "2025-09-04 04:20:28.025333", + "step": 3865, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:20:28.145124", + "step": 3865, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03417619690299034, + "timestamp": "2025-09-04 04:20:28.165784", + "step": 3866, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 1024 + ], + "flops": 20480124393472.0 + }, + "timestamp": "2025-09-04 04:20:28.313306", + "step": 3866, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007200147025287151, + "timestamp": "2025-09-04 04:20:28.341721", + "step": 3867, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:20:28.443445", + "step": 3867, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.04463130235671997, + "timestamp": "2025-09-04 04:20:28.470677", + "step": 3868, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:20:28.618239", + "step": 3868, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00033794058253988624, + "timestamp": "2025-09-04 04:20:28.638786", + "step": 3869, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 544 + ], + "flops": 10880066115712.0 + }, + "timestamp": "2025-09-04 04:20:28.741173", + "step": 3869, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.09599971026182175, + "timestamp": "2025-09-04 04:20:28.756389", + "step": 3870, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:20:28.860971", + "step": 3870, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00989855919033289, + "timestamp": "2025-09-04 04:20:28.880352", + "step": 3871, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 480 + ], + "flops": 9600058345344.0 + }, + "timestamp": "2025-09-04 04:20:28.959284", + "step": 3871, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.020102083683013916, + "timestamp": "2025-09-04 04:20:28.973935", + "step": 3872, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:20:29.058460", + "step": 3872, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.010750843212008476, + "timestamp": "2025-09-04 04:20:29.075537", + "step": 3873, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 736 + ], + "flops": 14720089426816.0 + }, + "timestamp": "2025-09-04 04:20:29.187735", + "step": 3873, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.024111559614539146, + "timestamp": "2025-09-04 04:20:29.208011", + "step": 3874, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:20:29.322373", + "step": 3874, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0005178121500648558, + "timestamp": "2025-09-04 04:20:29.342893", + "step": 3875, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:20:29.439124", + "step": 3875, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008165943436324596, + "timestamp": "2025-09-04 04:20:29.457030", + "step": 3876, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:20:29.554137", + "step": 3876, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0027514963876456022, + "timestamp": "2025-09-04 04:20:29.573166", + "step": 3877, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:20:29.679149", + "step": 3877, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007004078943282366, + "timestamp": "2025-09-04 04:20:29.698344", + "step": 3878, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 464 + ], + "flops": 9280056402752.0 + }, + "timestamp": "2025-09-04 04:20:29.781681", + "step": 3878, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0011216630227863789, + "timestamp": "2025-09-04 04:20:29.794983", + "step": 3879, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:20:29.883567", + "step": 3879, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.008931388147175312, + "timestamp": "2025-09-04 04:20:29.899790", + "step": 3880, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:20:38.519446", + "step": 3880, + "epoch": 3 + }, + { + "type": "pplx", + "content": 287.92952489903456, + "timestamp": "2025-09-04 04:20:38.522894", + "step": 3880, + "epoch": 3 + }, + { + "type": "info", + "content": "Checkpoint saved at step 3880", + "timestamp": "2025-09-04 04:20:39.031240", + "step": 3880, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 688 + ], + "flops": 13760083599040.0 + }, + "timestamp": "2025-09-04 04:20:39.130503", + "step": 3880, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0010395454009994864, + "timestamp": "2025-09-04 04:20:39.151468", + "step": 3881, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 560 + ], + "flops": 11200068058304.0 + }, + "timestamp": "2025-09-04 04:20:39.237424", + "step": 3881, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004671165719628334, + "timestamp": "2025-09-04 04:20:39.252861", + "step": 3882, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:20:39.348868", + "step": 3882, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.004468918778002262, + "timestamp": "2025-09-04 04:20:39.366431", + "step": 3883, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:20:39.461021", + "step": 3883, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00038716281414963305, + "timestamp": "2025-09-04 04:20:39.478989", + "step": 3884, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 624 + ], + "flops": 12480075828672.0 + }, + "timestamp": "2025-09-04 04:20:39.580317", + "step": 3884, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0007503409287892282, + "timestamp": "2025-09-04 04:20:39.599412", + "step": 3885, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:20:39.719678", + "step": 3885, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.009064269252121449, + "timestamp": "2025-09-04 04:20:39.738284", + "step": 3886, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 656 + ], + "flops": 13120079713856.0 + }, + "timestamp": "2025-09-04 04:20:39.837794", + "step": 3886, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.00859552901238203, + "timestamp": "2025-09-04 04:20:39.856340", + "step": 3887, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 640 + ], + "flops": 12800077771264.0 + }, + "timestamp": "2025-09-04 04:20:39.952242", + "step": 3887, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007069156970828772, + "timestamp": "2025-09-04 04:20:39.970460", + "step": 3888, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:20:40.080145", + "step": 3888, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.003085685195401311, + "timestamp": "2025-09-04 04:20:40.102925", + "step": 3889, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 528 + ], + "flops": 10560064173120.0 + }, + "timestamp": "2025-09-04 04:20:40.188536", + "step": 3889, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.013397028669714928, + "timestamp": "2025-09-04 04:20:40.203557", + "step": 3890, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 704 + ], + "flops": 14080085541632.0 + }, + "timestamp": "2025-09-04 04:20:40.308276", + "step": 3890, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0029367466922849417, + "timestamp": "2025-09-04 04:20:40.327712", + "step": 3891, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:20:40.430752", + "step": 3891, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0016781740123406053, + "timestamp": "2025-09-04 04:20:40.450543", + "step": 3892, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:20:40.545165", + "step": 3892, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0011187720810994506, + "timestamp": "2025-09-04 04:20:40.563906", + "step": 3893, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 768 + ], + "flops": 15360093312000.0 + }, + "timestamp": "2025-09-04 04:20:40.673482", + "step": 3893, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.007320099975913763, + "timestamp": "2025-09-04 04:20:40.694144", + "step": 3894, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 672 + ], + "flops": 13440081656448.0 + }, + "timestamp": "2025-09-04 04:20:40.805033", + "step": 3894, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0027296245098114014, + "timestamp": "2025-09-04 04:20:40.824078", + "step": 3895, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 912 + ], + "flops": 18240110795328.0 + }, + "timestamp": "2025-09-04 04:20:40.958781", + "step": 3895, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0014986982569098473, + "timestamp": "2025-09-04 04:20:40.984293", + "step": 3896, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:20:41.076680", + "step": 3896, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.011014275252819061, + "timestamp": "2025-09-04 04:20:41.095522", + "step": 3897, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 496 + ], + "flops": 9920060287936.0 + }, + "timestamp": "2025-09-04 04:20:41.174621", + "step": 3897, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0021178617607802153, + "timestamp": "2025-09-04 04:20:41.188639", + "step": 3898, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 608 + ], + "flops": 12160073886080.0 + }, + "timestamp": "2025-09-04 04:20:41.283021", + "step": 3898, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.03507016599178314, + "timestamp": "2025-09-04 04:20:41.300244", + "step": 3899, + "epoch": 3 + }, + { + "type": "flops", + "content": { + "type": "train", + "batch_dim": [ + 4, + 752 + ], + "flops": 15040091369408.0 + }, + "timestamp": "2025-09-04 04:20:41.413224", + "step": 3899, + "epoch": 3 + }, + { + "type": "loss", + "content": 0.0063927737064659595, + "timestamp": "2025-09-04 04:20:41.434668", + "step": 3900, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:20:49.925006", + "step": 3900, + "epoch": 3 + }, + { + "type": "pplx", + "content": 280.9776699653499, + "timestamp": "2025-09-04 04:20:49.927474", + "step": 3900, + "epoch": 3 + }, + { + "type": "flops", + "content": [ + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1200 + ], + "batch_size": 8, + "flops": 23953716633984 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1424 + ], + "batch_size": 8, + "flops": 28425077059712 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 528 + ], + "batch_size": 8, + "flops": 10539635356800 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 544 + ], + "batch_size": 8, + "flops": 10859018244352 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 880 + ], + "batch_size": 8, + "flops": 17566058882944 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 784 + ], + "batch_size": 8, + "flops": 15649761557632 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 736 + ], + "batch_size": 8, + "flops": 14691612894976 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 592 + ], + "batch_size": 8, + "flops": 11817166907008 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 576 + ], + "batch_size": 8, + "flops": 11497784019456 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 608 + ], + "batch_size": 8, + "flops": 12136549794560 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 704 + ], + "batch_size": 8, + "flops": 14052847119872 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1344 + ], + "batch_size": 8, + "flops": 26828162621952 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 624 + ], + "batch_size": 8, + "flops": 12455932682112 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 688 + ], + "batch_size": 8, + "flops": 13733464232320 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 1376 + ], + "batch_size": 8, + "flops": 27466928397056 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 720 + ], + "batch_size": 8, + "flops": 14372230007424 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 672 + ], + "batch_size": 8, + "flops": 13414081344768 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 560 + ], + "batch_size": 8, + "flops": 11178401131904 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 640 + ], + "batch_size": 8, + "flops": 12775315569664 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 8, + 656 + ], + "batch_size": 8, + "flops": 13094698457216 + }, + { + "type": "perplexity", + "in_batch_dim": [ + 2, + 384 + ], + "batch_size": 8, + "flops": 7665189368832 + } + ], + "timestamp": "2025-09-04 04:20:58.637186", + "step": 3900, + "epoch": 3 + }, + { + "type": "pplx", + "content": 280.9776699653499, + "timestamp": "2025-09-04 04:20:58.649417", + "step": 3900, + "epoch": 3 + }, + { + "type": "best_pplx", + "content": 255.24736725474492, + "timestamp": "2025-09-04 04:20:58.657551", + "step": 3900, + "epoch": 3 + }, + { + "type": "best_step", + "content": 3640, + "timestamp": "2025-09-04 04:20:58.671847", + "step": 3900, + "epoch": 3 + }, + { + "type": "total_pplx_flops", + "content": 224996302651284480, + "timestamp": "2025-09-04 04:20:58.684121", + "step": 3900, + "epoch": 3 + }, + { + "type": "total_train_flops", + "content": 5.201887604744794e+16, + "timestamp": "2025-09-04 04:20:59.236260", + "step": 3900, + "epoch": 3 + } + ], + "best_evals": { + "pplx": { + "score": 255.24736725474492, + "step": 3640 + }, + "rouge1": { + "precision": 0.8382830502830503, + "recall": 0.827133089133089, + "fmeasure": 0.8242266793036024 + } + } +} \ No newline at end of file