{ "training_args": { "output_dir": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters/nlu_mrpc_ff_v1", "overwrite_output_dir": false, "do_train": false, "do_eval": true, "do_predict": false, "eval_strategy": "steps", "prediction_loss_only": false, "per_device_train_batch_size": 4, "per_device_eval_batch_size": 8, "per_gpu_train_batch_size": null, "per_gpu_eval_batch_size": null, "gradient_accumulation_steps": 4, "eval_accumulation_steps": null, "eval_delay": 0, "torch_empty_cache_steps": null, "learning_rate": 2e-05, "weight_decay": 0.0, "adam_beta1": 0.9, "adam_beta2": 0.999, "adam_epsilon": 1e-08, "max_grad_norm": 1.0, "num_train_epochs": 3, "max_steps": -1, "lr_scheduler_type": "linear", "lr_scheduler_kwargs": {}, "warmup_ratio": 0.0, "warmup_steps": 0, "log_level": "passive", "log_level_replica": "warning", "log_on_each_node": true, "logging_dir": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters/nlu_mrpc_ff_v1/runs/Sep15_03-18-49_gx08", "logging_strategy": "steps", "logging_first_step": false, "logging_steps": 20, "logging_nan_inf_filter": true, "save_strategy": "epoch", "save_steps": 500, "save_total_limit": null, "save_safetensors": true, "save_on_each_node": false, "save_only_model": false, "restore_callback_states_from_checkpoint": false, "no_cuda": false, "use_cpu": false, "use_mps_device": false, "seed": 42, "data_seed": null, "jit_mode_eval": false, "use_ipex": false, "bf16": false, "fp16": false, "fp16_opt_level": "O1", "half_precision_backend": "auto", "bf16_full_eval": false, "fp16_full_eval": false, "tf32": null, "local_rank": 0, "ddp_backend": null, "tpu_num_cores": null, "tpu_metrics_debug": false, "debug": [], "dataloader_drop_last": false, "eval_steps": 57, "dataloader_num_workers": 0, "dataloader_prefetch_factor": null, "past_index": -1, "run_name": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters/nlu_mrpc_ff_v1", "disable_tqdm": false, "remove_unused_columns": true, "label_names": null, "load_best_model_at_end": false, "metric_for_best_model": null, "greater_is_better": null, "ignore_data_skip": false, "fsdp": [], "fsdp_min_num_params": 0, "fsdp_config": { "min_num_params": 0, "xla": false, "xla_fsdp_v2": false, "xla_fsdp_grad_ckpt": false }, "fsdp_transformer_layer_cls_to_wrap": null, "accelerator_config": { "split_batches": false, "dispatch_batches": null, "even_batches": true, "use_seedable_sampler": true, "non_blocking": false, "gradient_accumulation_kwargs": null }, "deepspeed": null, "label_smoothing_factor": 0.0, "optim": "adamw_torch", "optim_args": null, "adafactor": false, "group_by_length": false, "length_column_name": "length", "report_to": [], "ddp_find_unused_parameters": null, "ddp_bucket_cap_mb": null, "ddp_broadcast_buffers": null, "dataloader_pin_memory": true, "dataloader_persistent_workers": false, "skip_memory_metrics": true, "use_legacy_prediction_loop": false, "push_to_hub": false, "resume_from_checkpoint": null, "hub_model_id": null, "hub_strategy": "every_save", "hub_token": "", "hub_private_repo": null, "hub_always_push": false, "gradient_checkpointing": false, "gradient_checkpointing_kwargs": null, "include_inputs_for_metrics": false, "include_for_metrics": [], "eval_do_concat_batches": true, "fp16_backend": "auto", "push_to_hub_model_id": null, "push_to_hub_organization": null, "push_to_hub_token": "", "mp_parameters": "", "auto_find_batch_size": false, "full_determinism": false, "torchdynamo": null, "ray_scope": "last", "ddp_timeout": 1800, "torch_compile": false, "torch_compile_backend": null, "torch_compile_mode": null, "include_tokens_per_second": false, "include_num_input_tokens_seen": false, "neftune_noise_alpha": null, "optim_target_modules": null, "batch_eval_metrics": false, "eval_on_start": false, "use_liger_kernel": false, "eval_use_gather_object": false, "average_tokens_across_devices": false }, "lora_config": null, "flops": { "eval": 5014951860256000, "train": 10640863719936576, "total": 15655815580192576 }, "total_energy": 12.38765, "logs": [ { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:18:56.278210", "step": 0, "epoch": 0 }, { "type": "pplx", "content": 226674977.87649825, "timestamp": "2025-09-15 03:18:56.280551", "step": 0, "epoch": 0 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:18:56.348646", "step": 0, "epoch": 1 }, { "type": "loss", "content": 0.6993563771247864, "timestamp": "2025-09-15 03:18:56.351544", "step": 1, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:56.406233", "step": 1, "epoch": 1 }, { "type": "loss", "content": 0.6954641342163086, "timestamp": "2025-09-15 03:18:56.408321", "step": 2, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:56.438403", "step": 2, "epoch": 1 }, { "type": "loss", "content": 0.7254253029823303, "timestamp": "2025-09-15 03:18:56.440509", "step": 3, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:56.469959", "step": 3, "epoch": 1 }, { "type": "loss", "content": 0.7056644558906555, "timestamp": "2025-09-15 03:18:56.542440", "step": 4, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:56.574764", "step": 4, "epoch": 1 }, { "type": "loss", "content": 0.11781269311904907, "timestamp": "2025-09-15 03:18:56.576792", "step": 5, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:56.607459", "step": 5, "epoch": 1 }, { "type": "loss", "content": 0.12128707021474838, "timestamp": "2025-09-15 03:18:56.609387", "step": 6, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:18:56.638998", "step": 6, "epoch": 1 }, { "type": "loss", "content": 0.12236928939819336, "timestamp": "2025-09-15 03:18:56.641222", "step": 7, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:56.671055", "step": 7, "epoch": 1 }, { "type": "loss", "content": 0.13365435600280762, "timestamp": "2025-09-15 03:18:56.694617", "step": 8, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:56.725056", "step": 8, "epoch": 1 }, { "type": "loss", "content": 0.006437270902097225, "timestamp": "2025-09-15 03:18:56.727266", "step": 9, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:56.756541", "step": 9, "epoch": 1 }, { "type": "loss", "content": 0.04929186776280403, "timestamp": "2025-09-15 03:18:56.758751", "step": 10, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:56.788831", "step": 10, "epoch": 1 }, { "type": "loss", "content": 0.024710940197110176, "timestamp": "2025-09-15 03:18:56.790973", "step": 11, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:56.820971", "step": 11, "epoch": 1 }, { "type": "loss", "content": 0.007417821791023016, "timestamp": "2025-09-15 03:18:56.844554", "step": 12, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:56.875236", "step": 12, "epoch": 1 }, { "type": "loss", "content": 0.021830081939697266, "timestamp": "2025-09-15 03:18:56.877306", "step": 13, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:56.906637", "step": 13, "epoch": 1 }, { "type": "loss", "content": 0.035914890468120575, "timestamp": "2025-09-15 03:18:56.908599", "step": 14, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:56.938129", "step": 14, "epoch": 1 }, { "type": "loss", "content": 0.02858211100101471, "timestamp": "2025-09-15 03:18:56.940300", "step": 15, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:56.970164", "step": 15, "epoch": 1 }, { "type": "loss", "content": 0.021775051951408386, "timestamp": "2025-09-15 03:18:56.993512", "step": 16, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:57.023220", "step": 16, "epoch": 1 }, { "type": "loss", "content": 0.052091244608163834, "timestamp": "2025-09-15 03:18:57.025256", "step": 17, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:57.056868", "step": 17, "epoch": 1 }, { "type": "loss", "content": 0.04736480861902237, "timestamp": "2025-09-15 03:18:57.059994", "step": 18, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:18:57.089801", "step": 18, "epoch": 1 }, { "type": "loss", "content": 0.03423725813627243, "timestamp": "2025-09-15 03:18:57.092033", "step": 19, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:57.121772", "step": 19, "epoch": 1 }, { "type": "loss", "content": 0.0331081748008728, "timestamp": "2025-09-15 03:18:57.145160", "step": 20, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:18:57.176424", "step": 20, "epoch": 1 }, { "type": "loss", "content": 0.03887788578867912, "timestamp": "2025-09-15 03:18:57.178482", "step": 21, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:57.208450", "step": 21, "epoch": 1 }, { "type": "loss", "content": 0.042407404631376266, "timestamp": "2025-09-15 03:18:57.210398", "step": 22, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:57.239886", "step": 22, "epoch": 1 }, { "type": "loss", "content": 0.02546323463320732, "timestamp": "2025-09-15 03:18:57.242010", "step": 23, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:57.271801", "step": 23, "epoch": 1 }, { "type": "loss", "content": 0.02281245030462742, "timestamp": "2025-09-15 03:18:57.295299", "step": 24, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:57.325592", "step": 24, "epoch": 1 }, { "type": "loss", "content": 0.024779021739959717, "timestamp": "2025-09-15 03:18:57.327527", "step": 25, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:18:57.357749", "step": 25, "epoch": 1 }, { "type": "loss", "content": 0.04004126042127609, "timestamp": "2025-09-15 03:18:57.360029", "step": 26, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:18:57.390858", "step": 26, "epoch": 1 }, { "type": "loss", "content": 0.022673314437270164, "timestamp": "2025-09-15 03:18:57.392930", "step": 27, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:57.423507", "step": 27, "epoch": 1 }, { "type": "loss", "content": 0.026302272453904152, "timestamp": "2025-09-15 03:18:57.447028", "step": 28, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:57.477613", "step": 28, "epoch": 1 }, { "type": "loss", "content": 0.03431862220168114, "timestamp": "2025-09-15 03:18:57.479784", "step": 29, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:57.509218", "step": 29, "epoch": 1 }, { "type": "loss", "content": 0.02185298502445221, "timestamp": "2025-09-15 03:18:57.511438", "step": 30, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:57.541511", "step": 30, "epoch": 1 }, { "type": "loss", "content": 0.022062508389353752, "timestamp": "2025-09-15 03:18:57.543578", "step": 31, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:57.573772", "step": 31, "epoch": 1 }, { "type": "loss", "content": 0.021942496299743652, "timestamp": "2025-09-15 03:18:57.597392", "step": 32, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:57.627701", "step": 32, "epoch": 1 }, { "type": "loss", "content": 0.01732688769698143, "timestamp": "2025-09-15 03:18:57.629783", "step": 33, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:57.659704", "step": 33, "epoch": 1 }, { "type": "loss", "content": 0.031228866428136826, "timestamp": "2025-09-15 03:18:57.661750", "step": 34, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:57.691609", "step": 34, "epoch": 1 }, { "type": "loss", "content": 0.01614983379840851, "timestamp": "2025-09-15 03:18:57.693798", "step": 35, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:57.723941", "step": 35, "epoch": 1 }, { "type": "loss", "content": 0.019677946344017982, "timestamp": "2025-09-15 03:18:57.747440", "step": 36, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:57.777010", "step": 36, "epoch": 1 }, { "type": "loss", "content": 0.021435057744383812, "timestamp": "2025-09-15 03:18:57.779236", "step": 37, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:57.809337", "step": 37, "epoch": 1 }, { "type": "loss", "content": 0.020535405725240707, "timestamp": "2025-09-15 03:18:57.811466", "step": 38, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:18:57.841257", "step": 38, "epoch": 1 }, { "type": "loss", "content": 0.016418244689702988, "timestamp": "2025-09-15 03:18:57.843311", "step": 39, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:57.872971", "step": 39, "epoch": 1 }, { "type": "loss", "content": 0.017667364329099655, "timestamp": "2025-09-15 03:18:57.896459", "step": 40, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:57.926342", "step": 40, "epoch": 1 }, { "type": "loss", "content": 0.011035345494747162, "timestamp": "2025-09-15 03:18:57.928424", "step": 41, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:18:57.958599", "step": 41, "epoch": 1 }, { "type": "loss", "content": 0.017797647044062614, "timestamp": "2025-09-15 03:18:57.960698", "step": 42, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:57.990571", "step": 42, "epoch": 1 }, { "type": "loss", "content": 0.031567107886075974, "timestamp": "2025-09-15 03:18:57.992465", "step": 43, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:18:58.038881", "step": 43, "epoch": 1 }, { "type": "loss", "content": 0.011192579753696918, "timestamp": "2025-09-15 03:18:58.062238", "step": 44, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:58.091817", "step": 44, "epoch": 1 }, { "type": "loss", "content": 0.007767003960907459, "timestamp": "2025-09-15 03:18:58.093999", "step": 45, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:58.123652", "step": 45, "epoch": 1 }, { "type": "loss", "content": 0.0076360017992556095, "timestamp": "2025-09-15 03:18:58.125689", "step": 46, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:58.155072", "step": 46, "epoch": 1 }, { "type": "loss", "content": 0.0363575778901577, "timestamp": "2025-09-15 03:18:58.157084", "step": 47, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:58.186806", "step": 47, "epoch": 1 }, { "type": "loss", "content": 0.0191658865660429, "timestamp": "2025-09-15 03:18:58.210638", "step": 48, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:58.240693", "step": 48, "epoch": 1 }, { "type": "loss", "content": 0.004426487721502781, "timestamp": "2025-09-15 03:18:58.242702", "step": 49, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:18:58.272343", "step": 49, "epoch": 1 }, { "type": "loss", "content": 0.03847786784172058, "timestamp": "2025-09-15 03:18:58.275434", "step": 50, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:58.305274", "step": 50, "epoch": 1 }, { "type": "loss", "content": 0.00494822021573782, "timestamp": "2025-09-15 03:18:58.307646", "step": 51, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:58.337515", "step": 51, "epoch": 1 }, { "type": "loss", "content": 0.034384578466415405, "timestamp": "2025-09-15 03:18:58.361238", "step": 52, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:58.391506", "step": 52, "epoch": 1 }, { "type": "loss", "content": 0.03528139740228653, "timestamp": "2025-09-15 03:18:58.393553", "step": 53, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:18:58.423308", "step": 53, "epoch": 1 }, { "type": "loss", "content": 0.0411178395152092, "timestamp": "2025-09-15 03:18:58.425449", "step": 54, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:18:58.456132", "step": 54, "epoch": 1 }, { "type": "loss", "content": 0.02395920641720295, "timestamp": "2025-09-15 03:18:58.458396", "step": 55, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:18:58.488216", "step": 55, "epoch": 1 }, { "type": "loss", "content": 0.04303309693932533, "timestamp": "2025-09-15 03:18:58.511806", "step": 56, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:58.542198", "step": 56, "epoch": 1 }, { "type": "loss", "content": 0.024037647992372513, "timestamp": "2025-09-15 03:18:58.544207", "step": 57, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:18:59.259705", "step": 57, "epoch": 1 }, { "type": "pplx", "content": 77745310.38449234, "timestamp": "2025-09-15 03:18:59.261544", "step": 57, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:59.290185", "step": 57, "epoch": 1 }, { "type": "loss", "content": 0.021767010912299156, "timestamp": "2025-09-15 03:18:59.292295", "step": 58, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:59.322380", "step": 58, "epoch": 1 }, { "type": "loss", "content": 0.017204873263835907, "timestamp": "2025-09-15 03:18:59.324535", "step": 59, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:59.354386", "step": 59, "epoch": 1 }, { "type": "loss", "content": 0.020821845158934593, "timestamp": "2025-09-15 03:18:59.377913", "step": 60, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:59.407692", "step": 60, "epoch": 1 }, { "type": "loss", "content": 0.018509017303586006, "timestamp": "2025-09-15 03:18:59.409984", "step": 61, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:18:59.439863", "step": 61, "epoch": 1 }, { "type": "loss", "content": 0.024281423538923264, "timestamp": "2025-09-15 03:18:59.442025", "step": 62, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:18:59.472179", "step": 62, "epoch": 1 }, { "type": "loss", "content": 0.008048784919083118, "timestamp": "2025-09-15 03:18:59.474462", "step": 63, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:59.504248", "step": 63, "epoch": 1 }, { "type": "loss", "content": 0.007551565300673246, "timestamp": "2025-09-15 03:18:59.527837", "step": 64, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:59.557797", "step": 64, "epoch": 1 }, { "type": "loss", "content": 0.010596475563943386, "timestamp": "2025-09-15 03:18:59.560082", "step": 65, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:18:59.589733", "step": 65, "epoch": 1 }, { "type": "loss", "content": 0.02982070855796337, "timestamp": "2025-09-15 03:18:59.592639", "step": 66, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:59.622117", "step": 66, "epoch": 1 }, { "type": "loss", "content": 0.012310145422816277, "timestamp": "2025-09-15 03:18:59.624515", "step": 67, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:59.654600", "step": 67, "epoch": 1 }, { "type": "loss", "content": 0.027565743774175644, "timestamp": "2025-09-15 03:18:59.678557", "step": 68, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:18:59.708673", "step": 68, "epoch": 1 }, { "type": "loss", "content": 0.029646441340446472, "timestamp": "2025-09-15 03:18:59.710856", "step": 69, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:59.740855", "step": 69, "epoch": 1 }, { "type": "loss", "content": 0.017770150676369667, "timestamp": "2025-09-15 03:18:59.743111", "step": 70, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:59.773170", "step": 70, "epoch": 1 }, { "type": "loss", "content": 0.013608084991574287, "timestamp": "2025-09-15 03:18:59.775303", "step": 71, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:59.804981", "step": 71, "epoch": 1 }, { "type": "loss", "content": 0.031161179766058922, "timestamp": "2025-09-15 03:18:59.828472", "step": 72, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:18:59.858047", "step": 72, "epoch": 1 }, { "type": "loss", "content": 0.019617022946476936, "timestamp": "2025-09-15 03:18:59.860208", "step": 73, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:59.889161", "step": 73, "epoch": 1 }, { "type": "loss", "content": 0.02784857526421547, "timestamp": "2025-09-15 03:18:59.891174", "step": 74, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:18:59.920539", "step": 74, "epoch": 1 }, { "type": "loss", "content": 0.016915084794163704, "timestamp": "2025-09-15 03:18:59.922942", "step": 75, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:18:59.952599", "step": 75, "epoch": 1 }, { "type": "loss", "content": 0.021168315783143044, "timestamp": "2025-09-15 03:18:59.976206", "step": 76, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:00.006233", "step": 76, "epoch": 1 }, { "type": "loss", "content": 0.019576409831643105, "timestamp": "2025-09-15 03:19:00.008352", "step": 77, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:00.038249", "step": 77, "epoch": 1 }, { "type": "loss", "content": 0.02989226020872593, "timestamp": "2025-09-15 03:19:00.040376", "step": 78, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:00.070001", "step": 78, "epoch": 1 }, { "type": "loss", "content": 0.029687274247407913, "timestamp": "2025-09-15 03:19:00.072294", "step": 79, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:00.101998", "step": 79, "epoch": 1 }, { "type": "loss", "content": 0.022338515147566795, "timestamp": "2025-09-15 03:19:00.125621", "step": 80, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:00.155569", "step": 80, "epoch": 1 }, { "type": "loss", "content": 0.02291913703083992, "timestamp": "2025-09-15 03:19:00.157493", "step": 81, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:00.186461", "step": 81, "epoch": 1 }, { "type": "loss", "content": 0.022767139598727226, "timestamp": "2025-09-15 03:19:00.188649", "step": 82, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:00.218787", "step": 82, "epoch": 1 }, { "type": "loss", "content": 0.027573389932513237, "timestamp": "2025-09-15 03:19:00.221168", "step": 83, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:00.250784", "step": 83, "epoch": 1 }, { "type": "loss", "content": 0.02591836452484131, "timestamp": "2025-09-15 03:19:00.274430", "step": 84, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:00.316999", "step": 84, "epoch": 1 }, { "type": "loss", "content": 0.022715887054800987, "timestamp": "2025-09-15 03:19:00.318996", "step": 85, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:00.348721", "step": 85, "epoch": 1 }, { "type": "loss", "content": 0.021058840677142143, "timestamp": "2025-09-15 03:19:00.350847", "step": 86, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:00.380696", "step": 86, "epoch": 1 }, { "type": "loss", "content": 0.017032312229275703, "timestamp": "2025-09-15 03:19:00.382764", "step": 87, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:00.412499", "step": 87, "epoch": 1 }, { "type": "loss", "content": 0.020295526832342148, "timestamp": "2025-09-15 03:19:00.435875", "step": 88, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:00.465171", "step": 88, "epoch": 1 }, { "type": "loss", "content": 0.02098015323281288, "timestamp": "2025-09-15 03:19:00.467078", "step": 89, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:00.498950", "step": 89, "epoch": 1 }, { "type": "loss", "content": 0.02113400027155876, "timestamp": "2025-09-15 03:19:00.501326", "step": 90, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:00.532873", "step": 90, "epoch": 1 }, { "type": "loss", "content": 0.022542644292116165, "timestamp": "2025-09-15 03:19:00.534877", "step": 91, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:00.564346", "step": 91, "epoch": 1 }, { "type": "loss", "content": 0.025110268965363503, "timestamp": "2025-09-15 03:19:00.587847", "step": 92, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:00.617394", "step": 92, "epoch": 1 }, { "type": "loss", "content": 0.017810607329010963, "timestamp": "2025-09-15 03:19:00.619471", "step": 93, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:00.649640", "step": 93, "epoch": 1 }, { "type": "loss", "content": 0.024205828085541725, "timestamp": "2025-09-15 03:19:00.651663", "step": 94, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:00.681964", "step": 94, "epoch": 1 }, { "type": "loss", "content": 0.019303206354379654, "timestamp": "2025-09-15 03:19:00.683948", "step": 95, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:00.713315", "step": 95, "epoch": 1 }, { "type": "loss", "content": 0.02511030063033104, "timestamp": "2025-09-15 03:19:00.736847", "step": 96, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:00.766897", "step": 96, "epoch": 1 }, { "type": "loss", "content": 0.018995894119143486, "timestamp": "2025-09-15 03:19:00.769058", "step": 97, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:00.799028", "step": 97, "epoch": 1 }, { "type": "loss", "content": 0.022378621622920036, "timestamp": "2025-09-15 03:19:00.801181", "step": 98, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:00.831567", "step": 98, "epoch": 1 }, { "type": "loss", "content": 0.01698601059615612, "timestamp": "2025-09-15 03:19:00.833574", "step": 99, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:00.863219", "step": 99, "epoch": 1 }, { "type": "loss", "content": 0.016753217205405235, "timestamp": "2025-09-15 03:19:00.886611", "step": 100, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:00.916543", "step": 100, "epoch": 1 }, { "type": "loss", "content": 0.01952139474451542, "timestamp": "2025-09-15 03:19:00.918792", "step": 101, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:00.948979", "step": 101, "epoch": 1 }, { "type": "loss", "content": 0.018787041306495667, "timestamp": "2025-09-15 03:19:00.951249", "step": 102, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:00.980867", "step": 102, "epoch": 1 }, { "type": "loss", "content": 0.020538141950964928, "timestamp": "2025-09-15 03:19:00.982910", "step": 103, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:01.011728", "step": 103, "epoch": 1 }, { "type": "loss", "content": 0.03242240101099014, "timestamp": "2025-09-15 03:19:01.035422", "step": 104, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:01.067236", "step": 104, "epoch": 1 }, { "type": "loss", "content": 0.03287802264094353, "timestamp": "2025-09-15 03:19:01.069139", "step": 105, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:01.098994", "step": 105, "epoch": 1 }, { "type": "loss", "content": 0.007900225929915905, "timestamp": "2025-09-15 03:19:01.101346", "step": 106, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:01.131902", "step": 106, "epoch": 1 }, { "type": "loss", "content": 0.02327314205467701, "timestamp": "2025-09-15 03:19:01.134861", "step": 107, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:01.164783", "step": 107, "epoch": 1 }, { "type": "loss", "content": 0.04384846240282059, "timestamp": "2025-09-15 03:19:01.188312", "step": 108, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:01.217910", "step": 108, "epoch": 1 }, { "type": "loss", "content": 0.0458383746445179, "timestamp": "2025-09-15 03:19:01.219966", "step": 109, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:19:01.259718", "step": 109, "epoch": 1 }, { "type": "loss", "content": 0.039285700768232346, "timestamp": "2025-09-15 03:19:01.261751", "step": 110, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:01.292034", "step": 110, "epoch": 1 }, { "type": "loss", "content": 0.007816492579877377, "timestamp": "2025-09-15 03:19:01.293996", "step": 111, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:01.324128", "step": 111, "epoch": 1 }, { "type": "loss", "content": 0.03153624385595322, "timestamp": "2025-09-15 03:19:01.347768", "step": 112, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:01.377623", "step": 112, "epoch": 1 }, { "type": "loss", "content": 0.028779255226254463, "timestamp": "2025-09-15 03:19:01.380014", "step": 113, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:01.409598", "step": 113, "epoch": 1 }, { "type": "loss", "content": 0.024018559604883194, "timestamp": "2025-09-15 03:19:01.411698", "step": 114, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:19:02.174304", "step": 114, "epoch": 1 }, { "type": "pplx", "content": 86973788.16578543, "timestamp": "2025-09-15 03:19:02.176690", "step": 114, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:02.205542", "step": 114, "epoch": 1 }, { "type": "loss", "content": 0.008355635218322277, "timestamp": "2025-09-15 03:19:02.208474", "step": 115, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:02.238538", "step": 115, "epoch": 1 }, { "type": "loss", "content": 0.032105229794979095, "timestamp": "2025-09-15 03:19:02.262149", "step": 116, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:02.296821", "step": 116, "epoch": 1 }, { "type": "loss", "content": 0.019809599965810776, "timestamp": "2025-09-15 03:19:02.299717", "step": 117, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:02.331551", "step": 117, "epoch": 1 }, { "type": "loss", "content": 0.018696505576372147, "timestamp": "2025-09-15 03:19:02.336465", "step": 118, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:02.368402", "step": 118, "epoch": 1 }, { "type": "loss", "content": 0.016046470031142235, "timestamp": "2025-09-15 03:19:02.370728", "step": 119, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:02.402126", "step": 119, "epoch": 1 }, { "type": "loss", "content": 0.020479975268244743, "timestamp": "2025-09-15 03:19:02.431958", "step": 120, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:02.461857", "step": 120, "epoch": 1 }, { "type": "loss", "content": 0.03392161801457405, "timestamp": "2025-09-15 03:19:02.464100", "step": 121, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:02.493657", "step": 121, "epoch": 1 }, { "type": "loss", "content": 0.022408468648791313, "timestamp": "2025-09-15 03:19:02.495858", "step": 122, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:02.525604", "step": 122, "epoch": 1 }, { "type": "loss", "content": 0.022227082401514053, "timestamp": "2025-09-15 03:19:02.529820", "step": 123, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:02.559977", "step": 123, "epoch": 1 }, { "type": "loss", "content": 0.01427256129682064, "timestamp": "2025-09-15 03:19:02.583505", "step": 124, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:02.613343", "step": 124, "epoch": 1 }, { "type": "loss", "content": 0.019987378269433975, "timestamp": "2025-09-15 03:19:02.615379", "step": 125, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:02.644606", "step": 125, "epoch": 1 }, { "type": "loss", "content": 0.031713634729385376, "timestamp": "2025-09-15 03:19:02.646783", "step": 126, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:02.676253", "step": 126, "epoch": 1 }, { "type": "loss", "content": 0.01686117984354496, "timestamp": "2025-09-15 03:19:02.678652", "step": 127, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:02.712067", "step": 127, "epoch": 1 }, { "type": "loss", "content": 0.02143436297774315, "timestamp": "2025-09-15 03:19:02.735628", "step": 128, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:02.767720", "step": 128, "epoch": 1 }, { "type": "loss", "content": 0.022101953625679016, "timestamp": "2025-09-15 03:19:02.769787", "step": 129, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:02.800320", "step": 129, "epoch": 1 }, { "type": "loss", "content": 0.026244450360536575, "timestamp": "2025-09-15 03:19:02.802437", "step": 130, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:02.833505", "step": 130, "epoch": 1 }, { "type": "loss", "content": 0.02726527489721775, "timestamp": "2025-09-15 03:19:02.835688", "step": 131, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:02.866492", "step": 131, "epoch": 1 }, { "type": "loss", "content": 0.02093103528022766, "timestamp": "2025-09-15 03:19:02.890041", "step": 132, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:02.920009", "step": 132, "epoch": 1 }, { "type": "loss", "content": 0.016711866483092308, "timestamp": "2025-09-15 03:19:02.922125", "step": 133, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:02.952320", "step": 133, "epoch": 1 }, { "type": "loss", "content": 0.02590050920844078, "timestamp": "2025-09-15 03:19:02.954478", "step": 134, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:02.983972", "step": 134, "epoch": 1 }, { "type": "loss", "content": 0.022663580253720284, "timestamp": "2025-09-15 03:19:02.986239", "step": 135, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:03.016447", "step": 135, "epoch": 1 }, { "type": "loss", "content": 0.020828474313020706, "timestamp": "2025-09-15 03:19:03.040014", "step": 136, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:03.069773", "step": 136, "epoch": 1 }, { "type": "loss", "content": 0.02339036390185356, "timestamp": "2025-09-15 03:19:03.072113", "step": 137, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:03.102165", "step": 137, "epoch": 1 }, { "type": "loss", "content": 0.02548803947865963, "timestamp": "2025-09-15 03:19:03.104408", "step": 138, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:03.134903", "step": 138, "epoch": 1 }, { "type": "loss", "content": 0.01995263621211052, "timestamp": "2025-09-15 03:19:03.138022", "step": 139, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:03.168390", "step": 139, "epoch": 1 }, { "type": "loss", "content": 0.024372192099690437, "timestamp": "2025-09-15 03:19:03.192161", "step": 140, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:03.222149", "step": 140, "epoch": 1 }, { "type": "loss", "content": 0.019881976768374443, "timestamp": "2025-09-15 03:19:03.224545", "step": 141, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:03.254335", "step": 141, "epoch": 1 }, { "type": "loss", "content": 0.022397944703698158, "timestamp": "2025-09-15 03:19:03.256524", "step": 142, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:03.286144", "step": 142, "epoch": 1 }, { "type": "loss", "content": 0.024667898193001747, "timestamp": "2025-09-15 03:19:03.288521", "step": 143, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:03.319233", "step": 143, "epoch": 1 }, { "type": "loss", "content": 0.017718922346830368, "timestamp": "2025-09-15 03:19:03.342865", "step": 144, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:03.373055", "step": 144, "epoch": 1 }, { "type": "loss", "content": 0.01860089972615242, "timestamp": "2025-09-15 03:19:03.375029", "step": 145, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:03.405805", "step": 145, "epoch": 1 }, { "type": "loss", "content": 0.015553503297269344, "timestamp": "2025-09-15 03:19:03.408018", "step": 146, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:03.438209", "step": 146, "epoch": 1 }, { "type": "loss", "content": 0.021523339673876762, "timestamp": "2025-09-15 03:19:03.441331", "step": 147, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:03.471154", "step": 147, "epoch": 1 }, { "type": "loss", "content": 0.028602078557014465, "timestamp": "2025-09-15 03:19:03.494609", "step": 148, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:03.524627", "step": 148, "epoch": 1 }, { "type": "loss", "content": 0.03364170342683792, "timestamp": "2025-09-15 03:19:03.526662", "step": 149, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:03.556699", "step": 149, "epoch": 1 }, { "type": "loss", "content": 0.03490597754716873, "timestamp": "2025-09-15 03:19:03.559392", "step": 150, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:03.589524", "step": 150, "epoch": 1 }, { "type": "loss", "content": 0.018360691145062447, "timestamp": "2025-09-15 03:19:03.591671", "step": 151, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:03.621961", "step": 151, "epoch": 1 }, { "type": "loss", "content": 0.013131016865372658, "timestamp": "2025-09-15 03:19:03.645413", "step": 152, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:03.675118", "step": 152, "epoch": 1 }, { "type": "loss", "content": 0.014045425690710545, "timestamp": "2025-09-15 03:19:03.677261", "step": 153, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:03.707004", "step": 153, "epoch": 1 }, { "type": "loss", "content": 0.02355443872511387, "timestamp": "2025-09-15 03:19:03.709315", "step": 154, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:03.739404", "step": 154, "epoch": 1 }, { "type": "loss", "content": 0.0261994656175375, "timestamp": "2025-09-15 03:19:03.741490", "step": 155, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:03.772886", "step": 155, "epoch": 1 }, { "type": "loss", "content": 0.020213531330227852, "timestamp": "2025-09-15 03:19:03.796337", "step": 156, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:03.826456", "step": 156, "epoch": 1 }, { "type": "loss", "content": 0.038094911724328995, "timestamp": "2025-09-15 03:19:03.828647", "step": 157, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:03.857927", "step": 157, "epoch": 1 }, { "type": "loss", "content": 0.04792099818587303, "timestamp": "2025-09-15 03:19:03.860142", "step": 158, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:03.889704", "step": 158, "epoch": 1 }, { "type": "loss", "content": 0.03926343098282814, "timestamp": "2025-09-15 03:19:03.891850", "step": 159, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:03.921661", "step": 159, "epoch": 1 }, { "type": "loss", "content": 0.032103825360536575, "timestamp": "2025-09-15 03:19:03.945243", "step": 160, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:03.975950", "step": 160, "epoch": 1 }, { "type": "loss", "content": 0.02103709802031517, "timestamp": "2025-09-15 03:19:03.978127", "step": 161, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:04.007608", "step": 161, "epoch": 1 }, { "type": "loss", "content": 0.029447833076119423, "timestamp": "2025-09-15 03:19:04.009812", "step": 162, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:04.040241", "step": 162, "epoch": 1 }, { "type": "loss", "content": 0.023078102618455887, "timestamp": "2025-09-15 03:19:04.042501", "step": 163, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:04.072505", "step": 163, "epoch": 1 }, { "type": "loss", "content": 0.017872294411063194, "timestamp": "2025-09-15 03:19:04.096077", "step": 164, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:04.125768", "step": 164, "epoch": 1 }, { "type": "loss", "content": 0.02360287867486477, "timestamp": "2025-09-15 03:19:04.128019", "step": 165, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:04.157806", "step": 165, "epoch": 1 }, { "type": "loss", "content": 0.02339279279112816, "timestamp": "2025-09-15 03:19:04.160109", "step": 166, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:04.190157", "step": 166, "epoch": 1 }, { "type": "loss", "content": 0.02308400347828865, "timestamp": "2025-09-15 03:19:04.192101", "step": 167, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:04.221876", "step": 167, "epoch": 1 }, { "type": "loss", "content": 0.014306237921118736, "timestamp": "2025-09-15 03:19:04.245401", "step": 168, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:04.275219", "step": 168, "epoch": 1 }, { "type": "loss", "content": 0.02043766900897026, "timestamp": "2025-09-15 03:19:04.277425", "step": 169, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:04.307068", "step": 169, "epoch": 1 }, { "type": "loss", "content": 0.022694125771522522, "timestamp": "2025-09-15 03:19:04.309160", "step": 170, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:04.339510", "step": 170, "epoch": 1 }, { "type": "loss", "content": 0.018936406821012497, "timestamp": "2025-09-15 03:19:04.341812", "step": 171, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:19:05.057952", "step": 171, "epoch": 1 }, { "type": "pplx", "content": 91425276.58815499, "timestamp": "2025-09-15 03:19:05.061498", "step": 171, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:05.092818", "step": 171, "epoch": 1 }, { "type": "loss", "content": 0.026795899495482445, "timestamp": "2025-09-15 03:19:05.116285", "step": 172, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:05.146417", "step": 172, "epoch": 1 }, { "type": "loss", "content": 0.020205063745379448, "timestamp": "2025-09-15 03:19:05.148373", "step": 173, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:05.177984", "step": 173, "epoch": 1 }, { "type": "loss", "content": 0.03856884315609932, "timestamp": "2025-09-15 03:19:05.180286", "step": 174, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:05.209930", "step": 174, "epoch": 1 }, { "type": "loss", "content": 0.013646108098328114, "timestamp": "2025-09-15 03:19:05.212098", "step": 175, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:05.241825", "step": 175, "epoch": 1 }, { "type": "loss", "content": 0.021882707253098488, "timestamp": "2025-09-15 03:19:05.265174", "step": 176, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:05.294597", "step": 176, "epoch": 1 }, { "type": "loss", "content": 0.014430088922381401, "timestamp": "2025-09-15 03:19:05.296720", "step": 177, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:05.329726", "step": 177, "epoch": 1 }, { "type": "loss", "content": 0.01406953576952219, "timestamp": "2025-09-15 03:19:05.332206", "step": 178, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:05.366219", "step": 178, "epoch": 1 }, { "type": "loss", "content": 0.011970186606049538, "timestamp": "2025-09-15 03:19:05.368305", "step": 179, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:05.397902", "step": 179, "epoch": 1 }, { "type": "loss", "content": 0.027014048770070076, "timestamp": "2025-09-15 03:19:05.421672", "step": 180, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:05.452003", "step": 180, "epoch": 1 }, { "type": "loss", "content": 0.022636890411376953, "timestamp": "2025-09-15 03:19:05.454310", "step": 181, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:05.483868", "step": 181, "epoch": 1 }, { "type": "loss", "content": 0.027193518355488777, "timestamp": "2025-09-15 03:19:05.485894", "step": 182, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:05.516030", "step": 182, "epoch": 1 }, { "type": "loss", "content": 0.04116543009877205, "timestamp": "2025-09-15 03:19:05.518095", "step": 183, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:05.547843", "step": 183, "epoch": 1 }, { "type": "loss", "content": 0.03132352605462074, "timestamp": "2025-09-15 03:19:05.571327", "step": 184, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:05.601250", "step": 184, "epoch": 1 }, { "type": "loss", "content": 0.02800433151423931, "timestamp": "2025-09-15 03:19:05.603294", "step": 185, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:05.633329", "step": 185, "epoch": 1 }, { "type": "loss", "content": 0.021477000787854195, "timestamp": "2025-09-15 03:19:05.635508", "step": 186, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:05.665347", "step": 186, "epoch": 1 }, { "type": "loss", "content": 0.028118086978793144, "timestamp": "2025-09-15 03:19:05.669034", "step": 187, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:05.698265", "step": 187, "epoch": 1 }, { "type": "loss", "content": 0.021604614332318306, "timestamp": "2025-09-15 03:19:05.721675", "step": 188, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:05.751500", "step": 188, "epoch": 1 }, { "type": "loss", "content": 0.01969420723617077, "timestamp": "2025-09-15 03:19:05.753676", "step": 189, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:05.783885", "step": 189, "epoch": 1 }, { "type": "loss", "content": 0.02482091449201107, "timestamp": "2025-09-15 03:19:05.786206", "step": 190, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:05.816527", "step": 190, "epoch": 1 }, { "type": "loss", "content": 0.02031696029007435, "timestamp": "2025-09-15 03:19:05.818600", "step": 191, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:05.848360", "step": 191, "epoch": 1 }, { "type": "loss", "content": 0.022723600268363953, "timestamp": "2025-09-15 03:19:05.871726", "step": 192, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:05.901894", "step": 192, "epoch": 1 }, { "type": "loss", "content": 0.016467997804284096, "timestamp": "2025-09-15 03:19:05.904127", "step": 193, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:05.933980", "step": 193, "epoch": 1 }, { "type": "loss", "content": 0.015772905200719833, "timestamp": "2025-09-15 03:19:05.936522", "step": 194, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:05.966591", "step": 194, "epoch": 1 }, { "type": "loss", "content": 0.019026650115847588, "timestamp": "2025-09-15 03:19:05.968864", "step": 195, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:06.000085", "step": 195, "epoch": 1 }, { "type": "loss", "content": 0.021972985938191414, "timestamp": "2025-09-15 03:19:06.023661", "step": 196, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:06.053380", "step": 196, "epoch": 1 }, { "type": "loss", "content": 0.015487863682210445, "timestamp": "2025-09-15 03:19:06.055359", "step": 197, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:06.088407", "step": 197, "epoch": 1 }, { "type": "loss", "content": 0.02404738776385784, "timestamp": "2025-09-15 03:19:06.090919", "step": 198, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:06.121158", "step": 198, "epoch": 1 }, { "type": "loss", "content": 0.02887124940752983, "timestamp": "2025-09-15 03:19:06.123202", "step": 199, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:06.153230", "step": 199, "epoch": 1 }, { "type": "loss", "content": 0.030241340398788452, "timestamp": "2025-09-15 03:19:06.176720", "step": 200, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:06.206392", "step": 200, "epoch": 1 }, { "type": "loss", "content": 0.018082991242408752, "timestamp": "2025-09-15 03:19:06.208464", "step": 201, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:06.237849", "step": 201, "epoch": 1 }, { "type": "loss", "content": 0.019952043890953064, "timestamp": "2025-09-15 03:19:06.239745", "step": 202, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:06.270129", "step": 202, "epoch": 1 }, { "type": "loss", "content": 0.022944288328289986, "timestamp": "2025-09-15 03:19:06.272464", "step": 203, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:06.302044", "step": 203, "epoch": 1 }, { "type": "loss", "content": 0.021453650668263435, "timestamp": "2025-09-15 03:19:06.325520", "step": 204, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:06.355312", "step": 204, "epoch": 1 }, { "type": "loss", "content": 0.02109239064157009, "timestamp": "2025-09-15 03:19:06.357433", "step": 205, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:06.387010", "step": 205, "epoch": 1 }, { "type": "loss", "content": 0.013666792772710323, "timestamp": "2025-09-15 03:19:06.389078", "step": 206, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:06.419204", "step": 206, "epoch": 1 }, { "type": "loss", "content": 0.028260722756385803, "timestamp": "2025-09-15 03:19:06.421389", "step": 207, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:06.451056", "step": 207, "epoch": 1 }, { "type": "loss", "content": 0.01960393413901329, "timestamp": "2025-09-15 03:19:06.474604", "step": 208, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:06.504721", "step": 208, "epoch": 1 }, { "type": "loss", "content": 0.02854198031127453, "timestamp": "2025-09-15 03:19:06.507021", "step": 209, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:06.536534", "step": 209, "epoch": 1 }, { "type": "loss", "content": 0.03184450417757034, "timestamp": "2025-09-15 03:19:06.539231", "step": 210, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:06.568389", "step": 210, "epoch": 1 }, { "type": "loss", "content": 0.020297560840845108, "timestamp": "2025-09-15 03:19:06.570422", "step": 211, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:06.600999", "step": 211, "epoch": 1 }, { "type": "loss", "content": 0.034596700221300125, "timestamp": "2025-09-15 03:19:06.624533", "step": 212, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:06.654388", "step": 212, "epoch": 1 }, { "type": "loss", "content": 0.030675770714879036, "timestamp": "2025-09-15 03:19:06.656437", "step": 213, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:06.685852", "step": 213, "epoch": 1 }, { "type": "loss", "content": 0.018920252099633217, "timestamp": "2025-09-15 03:19:06.687976", "step": 214, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:06.718802", "step": 214, "epoch": 1 }, { "type": "loss", "content": 0.026095913723111153, "timestamp": "2025-09-15 03:19:06.720864", "step": 215, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:06.750882", "step": 215, "epoch": 1 }, { "type": "loss", "content": 0.024462031200528145, "timestamp": "2025-09-15 03:19:06.774322", "step": 216, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:06.804840", "step": 216, "epoch": 1 }, { "type": "loss", "content": 0.01993633806705475, "timestamp": "2025-09-15 03:19:06.806844", "step": 217, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:06.836665", "step": 217, "epoch": 1 }, { "type": "loss", "content": 0.022599508985877037, "timestamp": "2025-09-15 03:19:06.838996", "step": 218, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:06.868600", "step": 218, "epoch": 1 }, { "type": "loss", "content": 0.023072870448231697, "timestamp": "2025-09-15 03:19:06.870556", "step": 219, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:06.899769", "step": 219, "epoch": 1 }, { "type": "loss", "content": 0.01495459396392107, "timestamp": "2025-09-15 03:19:06.923135", "step": 220, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:06.953247", "step": 220, "epoch": 1 }, { "type": "loss", "content": 0.020196115598082542, "timestamp": "2025-09-15 03:19:06.955364", "step": 221, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:06.984766", "step": 221, "epoch": 1 }, { "type": "loss", "content": 0.025984425097703934, "timestamp": "2025-09-15 03:19:06.987145", "step": 222, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:07.017052", "step": 222, "epoch": 1 }, { "type": "loss", "content": 0.018872182816267014, "timestamp": "2025-09-15 03:19:07.019204", "step": 223, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:07.048505", "step": 223, "epoch": 1 }, { "type": "loss", "content": 0.02921387180685997, "timestamp": "2025-09-15 03:19:07.072120", "step": 224, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:07.102941", "step": 224, "epoch": 1 }, { "type": "loss", "content": 0.016917483881115913, "timestamp": "2025-09-15 03:19:07.105230", "step": 225, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:07.134819", "step": 225, "epoch": 1 }, { "type": "loss", "content": 0.020182834938168526, "timestamp": "2025-09-15 03:19:07.136924", "step": 226, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:07.166693", "step": 226, "epoch": 1 }, { "type": "loss", "content": 0.02879491075873375, "timestamp": "2025-09-15 03:19:07.169146", "step": 227, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:07.198969", "step": 227, "epoch": 1 }, { "type": "loss", "content": 0.014776498079299927, "timestamp": "2025-09-15 03:19:07.222511", "step": 228, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:19:07.932338", "step": 228, "epoch": 1 }, { "type": "pplx", "content": 95328226.03919551, "timestamp": "2025-09-15 03:19:07.934365", "step": 228, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:07.962339", "step": 228, "epoch": 1 }, { "type": "loss", "content": 0.01956128515303135, "timestamp": "2025-09-15 03:19:07.964666", "step": 229, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:07.995542", "step": 229, "epoch": 1 }, { "type": "loss", "content": 0.02058500424027443, "timestamp": "2025-09-15 03:19:07.998761", "step": 230, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:08.028450", "step": 230, "epoch": 1 }, { "type": "loss", "content": 0.021003449335694313, "timestamp": "2025-09-15 03:19:08.030453", "step": 231, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:08.060107", "step": 231, "epoch": 1 }, { "type": "loss", "content": 0.026520492509007454, "timestamp": "2025-09-15 03:19:08.083483", "step": 232, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:08.113321", "step": 232, "epoch": 1 }, { "type": "loss", "content": 0.021533282473683357, "timestamp": "2025-09-15 03:19:08.115783", "step": 233, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:08.146784", "step": 233, "epoch": 1 }, { "type": "loss", "content": 0.021764442324638367, "timestamp": "2025-09-15 03:19:08.148763", "step": 234, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:08.180901", "step": 234, "epoch": 1 }, { "type": "loss", "content": 0.01905788481235504, "timestamp": "2025-09-15 03:19:08.183096", "step": 235, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:08.212805", "step": 235, "epoch": 1 }, { "type": "loss", "content": 0.019037913531064987, "timestamp": "2025-09-15 03:19:08.236329", "step": 236, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:08.266192", "step": 236, "epoch": 1 }, { "type": "loss", "content": 0.020615851506590843, "timestamp": "2025-09-15 03:19:08.268294", "step": 237, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:08.298958", "step": 237, "epoch": 1 }, { "type": "loss", "content": 0.02502620592713356, "timestamp": "2025-09-15 03:19:08.302145", "step": 238, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:08.332058", "step": 238, "epoch": 1 }, { "type": "loss", "content": 0.028659885749220848, "timestamp": "2025-09-15 03:19:08.334193", "step": 239, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:08.363742", "step": 239, "epoch": 1 }, { "type": "loss", "content": 0.02334064617753029, "timestamp": "2025-09-15 03:19:08.387132", "step": 240, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:08.417280", "step": 240, "epoch": 1 }, { "type": "loss", "content": 0.01678556762635708, "timestamp": "2025-09-15 03:19:08.419374", "step": 241, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:08.452364", "step": 241, "epoch": 1 }, { "type": "loss", "content": 0.01673532836139202, "timestamp": "2025-09-15 03:19:08.454476", "step": 242, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:08.484480", "step": 242, "epoch": 1 }, { "type": "loss", "content": 0.015126006677746773, "timestamp": "2025-09-15 03:19:08.486777", "step": 243, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:08.516663", "step": 243, "epoch": 1 }, { "type": "loss", "content": 0.01618882827460766, "timestamp": "2025-09-15 03:19:08.540110", "step": 244, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:08.570018", "step": 244, "epoch": 1 }, { "type": "loss", "content": 0.0317390151321888, "timestamp": "2025-09-15 03:19:08.572522", "step": 245, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:08.602221", "step": 245, "epoch": 1 }, { "type": "loss", "content": 0.028386985883116722, "timestamp": "2025-09-15 03:19:08.605608", "step": 246, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:08.635427", "step": 246, "epoch": 1 }, { "type": "loss", "content": 0.029143307358026505, "timestamp": "2025-09-15 03:19:08.637834", "step": 247, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:08.667261", "step": 247, "epoch": 1 }, { "type": "loss", "content": 0.021952202543616295, "timestamp": "2025-09-15 03:19:08.690738", "step": 248, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:08.720309", "step": 248, "epoch": 1 }, { "type": "loss", "content": 0.03334729000926018, "timestamp": "2025-09-15 03:19:08.722406", "step": 249, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:08.752010", "step": 249, "epoch": 1 }, { "type": "loss", "content": 0.014858617447316647, "timestamp": "2025-09-15 03:19:08.754124", "step": 250, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:08.784404", "step": 250, "epoch": 1 }, { "type": "loss", "content": 0.02902458980679512, "timestamp": "2025-09-15 03:19:08.786665", "step": 251, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:08.816220", "step": 251, "epoch": 1 }, { "type": "loss", "content": 0.01632942259311676, "timestamp": "2025-09-15 03:19:08.839671", "step": 252, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:08.869985", "step": 252, "epoch": 1 }, { "type": "loss", "content": 0.013802406378090382, "timestamp": "2025-09-15 03:19:08.872161", "step": 253, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:08.901560", "step": 253, "epoch": 1 }, { "type": "loss", "content": 0.02780589461326599, "timestamp": "2025-09-15 03:19:08.903841", "step": 254, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:08.933951", "step": 254, "epoch": 1 }, { "type": "loss", "content": 0.013642417266964912, "timestamp": "2025-09-15 03:19:08.936850", "step": 255, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:19:08.966968", "step": 255, "epoch": 1 }, { "type": "loss", "content": 0.015592672862112522, "timestamp": "2025-09-15 03:19:08.990658", "step": 256, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:09.020439", "step": 256, "epoch": 1 }, { "type": "loss", "content": 0.01739800162613392, "timestamp": "2025-09-15 03:19:09.022653", "step": 257, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:09.052113", "step": 257, "epoch": 1 }, { "type": "loss", "content": 0.03306451067328453, "timestamp": "2025-09-15 03:19:09.054150", "step": 258, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:09.083625", "step": 258, "epoch": 1 }, { "type": "loss", "content": 0.02215319685637951, "timestamp": "2025-09-15 03:19:09.085969", "step": 259, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:09.116185", "step": 259, "epoch": 1 }, { "type": "loss", "content": 0.02427254430949688, "timestamp": "2025-09-15 03:19:09.139671", "step": 260, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:09.169285", "step": 260, "epoch": 1 }, { "type": "loss", "content": 0.0260100606828928, "timestamp": "2025-09-15 03:19:09.171879", "step": 261, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:09.201415", "step": 261, "epoch": 1 }, { "type": "loss", "content": 0.024701233953237534, "timestamp": "2025-09-15 03:19:09.203584", "step": 262, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:09.233865", "step": 262, "epoch": 1 }, { "type": "loss", "content": 0.02077108435332775, "timestamp": "2025-09-15 03:19:09.236049", "step": 263, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:09.265961", "step": 263, "epoch": 1 }, { "type": "loss", "content": 0.01494417805224657, "timestamp": "2025-09-15 03:19:09.289581", "step": 264, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:09.320083", "step": 264, "epoch": 1 }, { "type": "loss", "content": 0.022080078721046448, "timestamp": "2025-09-15 03:19:09.322399", "step": 265, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:09.352206", "step": 265, "epoch": 1 }, { "type": "loss", "content": 0.020438488572835922, "timestamp": "2025-09-15 03:19:09.354393", "step": 266, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:09.384316", "step": 266, "epoch": 1 }, { "type": "loss", "content": 0.021945010870695114, "timestamp": "2025-09-15 03:19:09.386423", "step": 267, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:09.416017", "step": 267, "epoch": 1 }, { "type": "loss", "content": 0.03710794076323509, "timestamp": "2025-09-15 03:19:09.439621", "step": 268, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:09.470007", "step": 268, "epoch": 1 }, { "type": "loss", "content": 0.015702739357948303, "timestamp": "2025-09-15 03:19:09.472211", "step": 269, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:09.501555", "step": 269, "epoch": 1 }, { "type": "loss", "content": 0.021318545565009117, "timestamp": "2025-09-15 03:19:09.503655", "step": 270, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:09.534917", "step": 270, "epoch": 1 }, { "type": "loss", "content": 0.023652182891964912, "timestamp": "2025-09-15 03:19:09.536996", "step": 271, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:09.566378", "step": 271, "epoch": 1 }, { "type": "loss", "content": 0.018605994060635567, "timestamp": "2025-09-15 03:19:09.589895", "step": 272, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:09.619087", "step": 272, "epoch": 1 }, { "type": "loss", "content": 0.026582153514027596, "timestamp": "2025-09-15 03:19:09.621135", "step": 273, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:09.650842", "step": 273, "epoch": 1 }, { "type": "loss", "content": 0.014253231696784496, "timestamp": "2025-09-15 03:19:09.652951", "step": 274, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:09.682729", "step": 274, "epoch": 1 }, { "type": "loss", "content": 0.019751057028770447, "timestamp": "2025-09-15 03:19:09.684742", "step": 275, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:09.714539", "step": 275, "epoch": 1 }, { "type": "loss", "content": 0.02657679282128811, "timestamp": "2025-09-15 03:19:09.738017", "step": 276, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:09.767917", "step": 276, "epoch": 1 }, { "type": "loss", "content": 0.010766022838652134, "timestamp": "2025-09-15 03:19:09.769933", "step": 277, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:09.799544", "step": 277, "epoch": 1 }, { "type": "loss", "content": 0.029790718108415604, "timestamp": "2025-09-15 03:19:09.801652", "step": 278, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:09.831257", "step": 278, "epoch": 1 }, { "type": "loss", "content": 0.023627398535609245, "timestamp": "2025-09-15 03:19:09.834136", "step": 279, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:09.864068", "step": 279, "epoch": 1 }, { "type": "loss", "content": 0.04631868749856949, "timestamp": "2025-09-15 03:19:09.887689", "step": 280, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:09.917469", "step": 280, "epoch": 1 }, { "type": "loss", "content": 0.017096029594540596, "timestamp": "2025-09-15 03:19:09.919490", "step": 281, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:09.950359", "step": 281, "epoch": 1 }, { "type": "loss", "content": 0.01746373064815998, "timestamp": "2025-09-15 03:19:09.952329", "step": 282, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:09.981577", "step": 282, "epoch": 1 }, { "type": "loss", "content": 0.02175072208046913, "timestamp": "2025-09-15 03:19:09.983607", "step": 283, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:10.013521", "step": 283, "epoch": 1 }, { "type": "loss", "content": 0.0345226414501667, "timestamp": "2025-09-15 03:19:10.036963", "step": 284, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:10.066817", "step": 284, "epoch": 1 }, { "type": "loss", "content": 0.02375132218003273, "timestamp": "2025-09-15 03:19:10.068739", "step": 285, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:19:11.150475", "step": 285, "epoch": 1 }, { "type": "pplx", "content": 99808598.98644093, "timestamp": "2025-09-15 03:19:11.152465", "step": 285, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:11.194591", "step": 285, "epoch": 1 }, { "type": "loss", "content": 0.0117965592071414, "timestamp": "2025-09-15 03:19:11.196757", "step": 286, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:19:11.231707", "step": 286, "epoch": 1 }, { "type": "loss", "content": 0.018215393647551537, "timestamp": "2025-09-15 03:19:11.234418", "step": 287, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:11.266630", "step": 287, "epoch": 1 }, { "type": "loss", "content": 0.03157208487391472, "timestamp": "2025-09-15 03:19:11.290263", "step": 288, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:11.320396", "step": 288, "epoch": 1 }, { "type": "loss", "content": 0.029829951003193855, "timestamp": "2025-09-15 03:19:11.322561", "step": 289, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:11.352362", "step": 289, "epoch": 1 }, { "type": "loss", "content": 0.01235766801983118, "timestamp": "2025-09-15 03:19:11.354458", "step": 290, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:11.384759", "step": 290, "epoch": 1 }, { "type": "loss", "content": 0.021873388439416885, "timestamp": "2025-09-15 03:19:11.387088", "step": 291, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:11.417122", "step": 291, "epoch": 1 }, { "type": "loss", "content": 0.02612815983593464, "timestamp": "2025-09-15 03:19:11.440806", "step": 292, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:11.471321", "step": 292, "epoch": 1 }, { "type": "loss", "content": 0.01425966527312994, "timestamp": "2025-09-15 03:19:11.473203", "step": 293, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:11.516242", "step": 293, "epoch": 1 }, { "type": "loss", "content": 0.0279195886105299, "timestamp": "2025-09-15 03:19:11.518210", "step": 294, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:11.554934", "step": 294, "epoch": 1 }, { "type": "loss", "content": 0.018090257421135902, "timestamp": "2025-09-15 03:19:11.556949", "step": 295, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:11.588962", "step": 295, "epoch": 1 }, { "type": "loss", "content": 0.014155800454318523, "timestamp": "2025-09-15 03:19:11.612405", "step": 296, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:11.643174", "step": 296, "epoch": 1 }, { "type": "loss", "content": 0.019184812903404236, "timestamp": "2025-09-15 03:19:11.645473", "step": 297, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:11.678940", "step": 297, "epoch": 1 }, { "type": "loss", "content": 0.025441085919737816, "timestamp": "2025-09-15 03:19:11.681103", "step": 298, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:11.710918", "step": 298, "epoch": 1 }, { "type": "loss", "content": 0.015621578320860863, "timestamp": "2025-09-15 03:19:11.713193", "step": 299, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:11.743550", "step": 299, "epoch": 1 }, { "type": "loss", "content": 0.016921255737543106, "timestamp": "2025-09-15 03:19:11.767940", "step": 300, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:11.799824", "step": 300, "epoch": 1 }, { "type": "loss", "content": 0.023754924535751343, "timestamp": "2025-09-15 03:19:11.801941", "step": 301, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:11.831833", "step": 301, "epoch": 1 }, { "type": "loss", "content": 0.013065925799310207, "timestamp": "2025-09-15 03:19:11.834215", "step": 302, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:11.864436", "step": 302, "epoch": 1 }, { "type": "loss", "content": 0.01167286280542612, "timestamp": "2025-09-15 03:19:11.866489", "step": 303, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:11.895878", "step": 303, "epoch": 1 }, { "type": "loss", "content": 0.01580006815493107, "timestamp": "2025-09-15 03:19:11.919994", "step": 304, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:11.949991", "step": 304, "epoch": 1 }, { "type": "loss", "content": 0.0222313292324543, "timestamp": "2025-09-15 03:19:11.952069", "step": 305, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:11.981715", "step": 305, "epoch": 1 }, { "type": "loss", "content": 0.028958607465028763, "timestamp": "2025-09-15 03:19:11.983938", "step": 306, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:12.013152", "step": 306, "epoch": 1 }, { "type": "loss", "content": 0.02822468802332878, "timestamp": "2025-09-15 03:19:12.015280", "step": 307, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:12.044489", "step": 307, "epoch": 1 }, { "type": "loss", "content": 0.019177228212356567, "timestamp": "2025-09-15 03:19:12.067927", "step": 308, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:12.097815", "step": 308, "epoch": 1 }, { "type": "loss", "content": 0.00797706376761198, "timestamp": "2025-09-15 03:19:12.100142", "step": 309, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:12.130005", "step": 309, "epoch": 1 }, { "type": "loss", "content": 0.029054300859570503, "timestamp": "2025-09-15 03:19:12.132173", "step": 310, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:12.161911", "step": 310, "epoch": 1 }, { "type": "loss", "content": 0.029977068305015564, "timestamp": "2025-09-15 03:19:12.165788", "step": 311, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:12.196357", "step": 311, "epoch": 1 }, { "type": "loss", "content": 0.01888146437704563, "timestamp": "2025-09-15 03:19:12.220112", "step": 312, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:12.249610", "step": 312, "epoch": 1 }, { "type": "loss", "content": 0.024634139612317085, "timestamp": "2025-09-15 03:19:12.255327", "step": 313, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:12.288147", "step": 313, "epoch": 1 }, { "type": "loss", "content": 0.014435885474085808, "timestamp": "2025-09-15 03:19:12.290509", "step": 314, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:12.322817", "step": 314, "epoch": 1 }, { "type": "loss", "content": 0.028926042839884758, "timestamp": "2025-09-15 03:19:12.324851", "step": 315, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:12.355228", "step": 315, "epoch": 1 }, { "type": "loss", "content": 0.007600546348839998, "timestamp": "2025-09-15 03:19:12.378821", "step": 316, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:12.409323", "step": 316, "epoch": 1 }, { "type": "loss", "content": 0.010601741261780262, "timestamp": "2025-09-15 03:19:12.411396", "step": 317, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:12.441237", "step": 317, "epoch": 1 }, { "type": "loss", "content": 0.027423948049545288, "timestamp": "2025-09-15 03:19:12.443004", "step": 318, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:12.472957", "step": 318, "epoch": 1 }, { "type": "loss", "content": 0.015101133845746517, "timestamp": "2025-09-15 03:19:12.475029", "step": 319, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:12.505958", "step": 319, "epoch": 1 }, { "type": "loss", "content": 0.020047292113304138, "timestamp": "2025-09-15 03:19:12.529618", "step": 320, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:12.559602", "step": 320, "epoch": 1 }, { "type": "loss", "content": 0.018711814656853676, "timestamp": "2025-09-15 03:19:12.561795", "step": 321, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:12.592324", "step": 321, "epoch": 1 }, { "type": "loss", "content": 0.032741669565439224, "timestamp": "2025-09-15 03:19:12.594579", "step": 322, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:12.624364", "step": 322, "epoch": 1 }, { "type": "loss", "content": 0.029987605288624763, "timestamp": "2025-09-15 03:19:12.626550", "step": 323, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:12.656605", "step": 323, "epoch": 1 }, { "type": "loss", "content": 0.022992825135588646, "timestamp": "2025-09-15 03:19:12.680198", "step": 324, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:12.710073", "step": 324, "epoch": 1 }, { "type": "loss", "content": 0.024438347667455673, "timestamp": "2025-09-15 03:19:12.712122", "step": 325, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:12.741794", "step": 325, "epoch": 1 }, { "type": "loss", "content": 0.024056127294898033, "timestamp": "2025-09-15 03:19:12.744030", "step": 326, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:12.773363", "step": 326, "epoch": 1 }, { "type": "loss", "content": 0.02588737942278385, "timestamp": "2025-09-15 03:19:12.775326", "step": 327, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:19:12.807193", "step": 327, "epoch": 1 }, { "type": "loss", "content": 0.023963138461112976, "timestamp": "2025-09-15 03:19:12.830777", "step": 328, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:12.861014", "step": 328, "epoch": 1 }, { "type": "loss", "content": 0.02214493788778782, "timestamp": "2025-09-15 03:19:12.863014", "step": 329, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:12.893138", "step": 329, "epoch": 1 }, { "type": "loss", "content": 0.021407917141914368, "timestamp": "2025-09-15 03:19:12.895126", "step": 330, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:12.925095", "step": 330, "epoch": 1 }, { "type": "loss", "content": 0.020377272740006447, "timestamp": "2025-09-15 03:19:12.927226", "step": 331, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:12.957070", "step": 331, "epoch": 1 }, { "type": "loss", "content": 0.028564879670739174, "timestamp": "2025-09-15 03:19:12.980628", "step": 332, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:13.010328", "step": 332, "epoch": 1 }, { "type": "loss", "content": 0.025727489963173866, "timestamp": "2025-09-15 03:19:13.012523", "step": 333, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:13.042417", "step": 333, "epoch": 1 }, { "type": "loss", "content": 0.02494569681584835, "timestamp": "2025-09-15 03:19:13.044487", "step": 334, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:13.074241", "step": 334, "epoch": 1 }, { "type": "loss", "content": 0.02045329287648201, "timestamp": "2025-09-15 03:19:13.076519", "step": 335, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:19:13.107170", "step": 335, "epoch": 1 }, { "type": "loss", "content": 0.02221083827316761, "timestamp": "2025-09-15 03:19:13.130954", "step": 336, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:13.161056", "step": 336, "epoch": 1 }, { "type": "loss", "content": 0.025785459205508232, "timestamp": "2025-09-15 03:19:13.164065", "step": 337, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:13.193949", "step": 337, "epoch": 1 }, { "type": "loss", "content": 0.0212246123701334, "timestamp": "2025-09-15 03:19:13.196340", "step": 338, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:13.225665", "step": 338, "epoch": 1 }, { "type": "loss", "content": 0.0205333661288023, "timestamp": "2025-09-15 03:19:13.228179", "step": 339, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:13.258176", "step": 339, "epoch": 1 }, { "type": "loss", "content": 0.01782972179353237, "timestamp": "2025-09-15 03:19:13.281652", "step": 340, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:13.312113", "step": 340, "epoch": 1 }, { "type": "loss", "content": 0.019433436915278435, "timestamp": "2025-09-15 03:19:13.314493", "step": 341, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:13.344611", "step": 341, "epoch": 1 }, { "type": "loss", "content": 0.02971636690199375, "timestamp": "2025-09-15 03:19:13.346966", "step": 342, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:19:14.057353", "step": 342, "epoch": 1 }, { "type": "pplx", "content": 103611561.54396842, "timestamp": "2025-09-15 03:19:14.059253", "step": 342, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:14.088526", "step": 342, "epoch": 1 }, { "type": "loss", "content": 0.020249316468834877, "timestamp": "2025-09-15 03:19:14.090455", "step": 343, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:14.122006", "step": 343, "epoch": 1 }, { "type": "loss", "content": 0.04564950615167618, "timestamp": "2025-09-15 03:19:14.145521", "step": 344, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:14.175077", "step": 344, "epoch": 1 }, { "type": "loss", "content": 0.024136368185281754, "timestamp": "2025-09-15 03:19:14.177680", "step": 345, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:14.209004", "step": 345, "epoch": 1 }, { "type": "loss", "content": 0.03156773000955582, "timestamp": "2025-09-15 03:19:14.211186", "step": 346, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:14.241016", "step": 346, "epoch": 1 }, { "type": "loss", "content": 0.024225564673542976, "timestamp": "2025-09-15 03:19:14.243533", "step": 347, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:14.273558", "step": 347, "epoch": 1 }, { "type": "loss", "content": 0.025378478690981865, "timestamp": "2025-09-15 03:19:14.297318", "step": 348, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:14.327983", "step": 348, "epoch": 1 }, { "type": "loss", "content": 0.0185211431235075, "timestamp": "2025-09-15 03:19:14.330186", "step": 349, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:14.360470", "step": 349, "epoch": 1 }, { "type": "loss", "content": 0.029448067769408226, "timestamp": "2025-09-15 03:19:14.362492", "step": 350, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:14.392848", "step": 350, "epoch": 1 }, { "type": "loss", "content": 0.03555438667535782, "timestamp": "2025-09-15 03:19:14.395026", "step": 351, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:14.426099", "step": 351, "epoch": 1 }, { "type": "loss", "content": 0.0238653514534235, "timestamp": "2025-09-15 03:19:14.449561", "step": 352, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:14.479231", "step": 352, "epoch": 1 }, { "type": "loss", "content": 0.03759421780705452, "timestamp": "2025-09-15 03:19:14.481329", "step": 353, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:14.511305", "step": 353, "epoch": 1 }, { "type": "loss", "content": 0.026176365092396736, "timestamp": "2025-09-15 03:19:14.513427", "step": 354, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:14.543093", "step": 354, "epoch": 1 }, { "type": "loss", "content": 0.02880324050784111, "timestamp": "2025-09-15 03:19:14.545168", "step": 355, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:14.575392", "step": 355, "epoch": 1 }, { "type": "loss", "content": 0.012668220326304436, "timestamp": "2025-09-15 03:19:14.598912", "step": 356, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:14.629312", "step": 356, "epoch": 1 }, { "type": "loss", "content": 0.017717767506837845, "timestamp": "2025-09-15 03:19:14.631250", "step": 357, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:14.661090", "step": 357, "epoch": 1 }, { "type": "loss", "content": 0.016119273379445076, "timestamp": "2025-09-15 03:19:14.663260", "step": 358, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:14.692787", "step": 358, "epoch": 1 }, { "type": "loss", "content": 0.011544623412191868, "timestamp": "2025-09-15 03:19:14.694825", "step": 359, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:14.724842", "step": 359, "epoch": 1 }, { "type": "loss", "content": 0.016963688656687737, "timestamp": "2025-09-15 03:19:14.748390", "step": 360, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:14.778525", "step": 360, "epoch": 1 }, { "type": "loss", "content": 0.0260551106184721, "timestamp": "2025-09-15 03:19:14.780863", "step": 361, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:14.812461", "step": 361, "epoch": 1 }, { "type": "loss", "content": 0.029832622036337852, "timestamp": "2025-09-15 03:19:14.814710", "step": 362, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:14.845126", "step": 362, "epoch": 1 }, { "type": "loss", "content": 0.020663557574152946, "timestamp": "2025-09-15 03:19:14.847561", "step": 363, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:14.878419", "step": 363, "epoch": 1 }, { "type": "loss", "content": 0.012055516242980957, "timestamp": "2025-09-15 03:19:14.902621", "step": 364, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:14.933484", "step": 364, "epoch": 1 }, { "type": "loss", "content": 0.02465776540338993, "timestamp": "2025-09-15 03:19:14.936922", "step": 365, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:14.967549", "step": 365, "epoch": 1 }, { "type": "loss", "content": 0.014459922909736633, "timestamp": "2025-09-15 03:19:14.970185", "step": 366, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:19:15.000758", "step": 366, "epoch": 1 }, { "type": "loss", "content": 0.017588822171092033, "timestamp": "2025-09-15 03:19:15.003192", "step": 367, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:15.033480", "step": 367, "epoch": 1 }, { "type": "loss", "content": 0.028651872649788857, "timestamp": "2025-09-15 03:19:15.056929", "step": 368, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:15.087563", "step": 368, "epoch": 1 }, { "type": "loss", "content": 0.022055622190237045, "timestamp": "2025-09-15 03:19:15.089746", "step": 369, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:15.120493", "step": 369, "epoch": 1 }, { "type": "loss", "content": 0.025541191920638084, "timestamp": "2025-09-15 03:19:15.122779", "step": 370, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:15.153139", "step": 370, "epoch": 1 }, { "type": "loss", "content": 0.01346271950751543, "timestamp": "2025-09-15 03:19:15.155147", "step": 371, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:15.184446", "step": 371, "epoch": 1 }, { "type": "loss", "content": 0.029344525188207626, "timestamp": "2025-09-15 03:19:15.208168", "step": 372, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:15.238243", "step": 372, "epoch": 1 }, { "type": "loss", "content": 0.02545652911067009, "timestamp": "2025-09-15 03:19:15.240521", "step": 373, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:15.270660", "step": 373, "epoch": 1 }, { "type": "loss", "content": 0.02384101040661335, "timestamp": "2025-09-15 03:19:15.272978", "step": 374, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:15.303721", "step": 374, "epoch": 1 }, { "type": "loss", "content": 0.01691906340420246, "timestamp": "2025-09-15 03:19:15.305806", "step": 375, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:15.336539", "step": 375, "epoch": 1 }, { "type": "loss", "content": 0.014407488517463207, "timestamp": "2025-09-15 03:19:15.360275", "step": 376, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:15.390488", "step": 376, "epoch": 1 }, { "type": "loss", "content": 0.01607523113489151, "timestamp": "2025-09-15 03:19:15.393291", "step": 377, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:15.422854", "step": 377, "epoch": 1 }, { "type": "loss", "content": 0.022174837067723274, "timestamp": "2025-09-15 03:19:15.425325", "step": 378, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:15.455251", "step": 378, "epoch": 1 }, { "type": "loss", "content": 0.019380640238523483, "timestamp": "2025-09-15 03:19:15.457431", "step": 379, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:15.487035", "step": 379, "epoch": 1 }, { "type": "loss", "content": 0.01895737834274769, "timestamp": "2025-09-15 03:19:15.510478", "step": 380, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:15.540937", "step": 380, "epoch": 1 }, { "type": "loss", "content": 0.02109598182141781, "timestamp": "2025-09-15 03:19:15.543200", "step": 381, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:15.573164", "step": 381, "epoch": 1 }, { "type": "loss", "content": 0.021322503685951233, "timestamp": "2025-09-15 03:19:15.575138", "step": 382, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:15.604655", "step": 382, "epoch": 1 }, { "type": "loss", "content": 0.016983283683657646, "timestamp": "2025-09-15 03:19:15.606731", "step": 383, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:15.636218", "step": 383, "epoch": 1 }, { "type": "loss", "content": 0.023025959730148315, "timestamp": "2025-09-15 03:19:15.659533", "step": 384, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:15.689472", "step": 384, "epoch": 1 }, { "type": "loss", "content": 0.01357300765812397, "timestamp": "2025-09-15 03:19:15.691565", "step": 385, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:15.721118", "step": 385, "epoch": 1 }, { "type": "loss", "content": 0.01553942821919918, "timestamp": "2025-09-15 03:19:15.723381", "step": 386, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:15.753008", "step": 386, "epoch": 1 }, { "type": "loss", "content": 0.016792044043540955, "timestamp": "2025-09-15 03:19:15.755077", "step": 387, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:15.784626", "step": 387, "epoch": 1 }, { "type": "loss", "content": 0.02156914956867695, "timestamp": "2025-09-15 03:19:15.808187", "step": 388, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:15.856953", "step": 388, "epoch": 1 }, { "type": "loss", "content": 0.017255697399377823, "timestamp": "2025-09-15 03:19:15.859593", "step": 389, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:15.889577", "step": 389, "epoch": 1 }, { "type": "loss", "content": 0.030908426269888878, "timestamp": "2025-09-15 03:19:15.891774", "step": 390, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:15.921381", "step": 390, "epoch": 1 }, { "type": "loss", "content": 0.035414181649684906, "timestamp": "2025-09-15 03:19:15.923879", "step": 391, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:15.953734", "step": 391, "epoch": 1 }, { "type": "loss", "content": 0.013454231433570385, "timestamp": "2025-09-15 03:19:15.977493", "step": 392, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:16.007644", "step": 392, "epoch": 1 }, { "type": "loss", "content": 0.01388184167444706, "timestamp": "2025-09-15 03:19:16.009665", "step": 393, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:19:16.039889", "step": 393, "epoch": 1 }, { "type": "loss", "content": 0.018477659672498703, "timestamp": "2025-09-15 03:19:16.042544", "step": 394, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:16.073246", "step": 394, "epoch": 1 }, { "type": "loss", "content": 0.00951298326253891, "timestamp": "2025-09-15 03:19:16.075369", "step": 395, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:16.105573", "step": 395, "epoch": 1 }, { "type": "loss", "content": 0.022019732743501663, "timestamp": "2025-09-15 03:19:16.128995", "step": 396, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:16.158640", "step": 396, "epoch": 1 }, { "type": "loss", "content": 0.007529764901846647, "timestamp": "2025-09-15 03:19:16.160824", "step": 397, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:16.190862", "step": 397, "epoch": 1 }, { "type": "loss", "content": 0.013377663679420948, "timestamp": "2025-09-15 03:19:16.193028", "step": 398, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:16.222718", "step": 398, "epoch": 1 }, { "type": "loss", "content": 0.02092798426747322, "timestamp": "2025-09-15 03:19:16.224930", "step": 399, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:19:16.937105", "step": 399, "epoch": 1 }, { "type": "pplx", "content": 106969624.85667004, "timestamp": "2025-09-15 03:19:16.938941", "step": 399, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:16.968244", "step": 399, "epoch": 1 }, { "type": "loss", "content": 0.016222558915615082, "timestamp": "2025-09-15 03:19:16.991840", "step": 400, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:17.022374", "step": 400, "epoch": 1 }, { "type": "loss", "content": 0.006949161179363728, "timestamp": "2025-09-15 03:19:17.024691", "step": 401, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:17.054915", "step": 401, "epoch": 1 }, { "type": "loss", "content": 0.021258166059851646, "timestamp": "2025-09-15 03:19:17.057139", "step": 402, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:17.087178", "step": 402, "epoch": 1 }, { "type": "loss", "content": 0.02063274383544922, "timestamp": "2025-09-15 03:19:17.089528", "step": 403, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:17.119255", "step": 403, "epoch": 1 }, { "type": "loss", "content": 0.0355384424328804, "timestamp": "2025-09-15 03:19:17.143008", "step": 404, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:17.173120", "step": 404, "epoch": 1 }, { "type": "loss", "content": 0.02840801514685154, "timestamp": "2025-09-15 03:19:17.175236", "step": 405, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:17.204937", "step": 405, "epoch": 1 }, { "type": "loss", "content": 0.035722844302654266, "timestamp": "2025-09-15 03:19:17.207127", "step": 406, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:17.236656", "step": 406, "epoch": 1 }, { "type": "loss", "content": 0.009866001084446907, "timestamp": "2025-09-15 03:19:17.241929", "step": 407, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:17.274706", "step": 407, "epoch": 1 }, { "type": "loss", "content": 0.007772717159241438, "timestamp": "2025-09-15 03:19:17.298620", "step": 408, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:17.328616", "step": 408, "epoch": 1 }, { "type": "loss", "content": 0.02096855826675892, "timestamp": "2025-09-15 03:19:17.330936", "step": 409, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:17.362169", "step": 409, "epoch": 1 }, { "type": "loss", "content": 0.021432984620332718, "timestamp": "2025-09-15 03:19:17.364656", "step": 410, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:17.395211", "step": 410, "epoch": 1 }, { "type": "loss", "content": 0.016841476783156395, "timestamp": "2025-09-15 03:19:17.397416", "step": 411, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:17.427588", "step": 411, "epoch": 1 }, { "type": "loss", "content": 0.03731584548950195, "timestamp": "2025-09-15 03:19:17.451200", "step": 412, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:17.481242", "step": 412, "epoch": 1 }, { "type": "loss", "content": 0.010997510515153408, "timestamp": "2025-09-15 03:19:17.483095", "step": 413, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:17.513132", "step": 413, "epoch": 1 }, { "type": "loss", "content": 0.03299867734313011, "timestamp": "2025-09-15 03:19:17.515254", "step": 414, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:17.544807", "step": 414, "epoch": 1 }, { "type": "loss", "content": 0.04234781861305237, "timestamp": "2025-09-15 03:19:17.546814", "step": 415, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:17.577458", "step": 415, "epoch": 1 }, { "type": "loss", "content": 0.008629407733678818, "timestamp": "2025-09-15 03:19:17.601149", "step": 416, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:17.631045", "step": 416, "epoch": 1 }, { "type": "loss", "content": 0.013242745772004128, "timestamp": "2025-09-15 03:19:17.633065", "step": 417, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:17.663210", "step": 417, "epoch": 1 }, { "type": "loss", "content": 0.02678370475769043, "timestamp": "2025-09-15 03:19:17.665528", "step": 418, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:17.695230", "step": 418, "epoch": 1 }, { "type": "loss", "content": 0.02190086431801319, "timestamp": "2025-09-15 03:19:17.697529", "step": 419, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:17.727255", "step": 419, "epoch": 1 }, { "type": "loss", "content": 0.01396601740270853, "timestamp": "2025-09-15 03:19:17.750708", "step": 420, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:17.780889", "step": 420, "epoch": 1 }, { "type": "loss", "content": 0.015750134363770485, "timestamp": "2025-09-15 03:19:17.782968", "step": 421, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:17.812631", "step": 421, "epoch": 1 }, { "type": "loss", "content": 0.014036260545253754, "timestamp": "2025-09-15 03:19:17.814594", "step": 422, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:17.845147", "step": 422, "epoch": 1 }, { "type": "loss", "content": 0.019544070586562157, "timestamp": "2025-09-15 03:19:17.847507", "step": 423, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:17.877464", "step": 423, "epoch": 1 }, { "type": "loss", "content": 0.01638009026646614, "timestamp": "2025-09-15 03:19:17.900894", "step": 424, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:17.931143", "step": 424, "epoch": 1 }, { "type": "loss", "content": 0.01974562369287014, "timestamp": "2025-09-15 03:19:17.933286", "step": 425, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:17.963320", "step": 425, "epoch": 1 }, { "type": "loss", "content": 0.022785307839512825, "timestamp": "2025-09-15 03:19:17.965504", "step": 426, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:17.995921", "step": 426, "epoch": 1 }, { "type": "loss", "content": 0.019113395363092422, "timestamp": "2025-09-15 03:19:17.998035", "step": 427, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:18.028032", "step": 427, "epoch": 1 }, { "type": "loss", "content": 0.013610420748591423, "timestamp": "2025-09-15 03:19:18.051538", "step": 428, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:18.081768", "step": 428, "epoch": 1 }, { "type": "loss", "content": 0.02456537075340748, "timestamp": "2025-09-15 03:19:18.083833", "step": 429, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:18.115577", "step": 429, "epoch": 1 }, { "type": "loss", "content": 0.02188361994922161, "timestamp": "2025-09-15 03:19:18.118002", "step": 430, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:18.148465", "step": 430, "epoch": 1 }, { "type": "loss", "content": 0.013410343788564205, "timestamp": "2025-09-15 03:19:18.150469", "step": 431, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:18.180379", "step": 431, "epoch": 1 }, { "type": "loss", "content": 0.02152617648243904, "timestamp": "2025-09-15 03:19:18.204057", "step": 432, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:18.234081", "step": 432, "epoch": 1 }, { "type": "loss", "content": 0.017813067883253098, "timestamp": "2025-09-15 03:19:18.236156", "step": 433, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:19:18.266139", "step": 433, "epoch": 1 }, { "type": "loss", "content": 0.02423473261296749, "timestamp": "2025-09-15 03:19:18.268706", "step": 434, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:18.298971", "step": 434, "epoch": 1 }, { "type": "loss", "content": 0.01806381531059742, "timestamp": "2025-09-15 03:19:18.301299", "step": 435, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:18.331157", "step": 435, "epoch": 1 }, { "type": "loss", "content": 0.01698886603116989, "timestamp": "2025-09-15 03:19:18.355447", "step": 436, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:18.385615", "step": 436, "epoch": 1 }, { "type": "loss", "content": 0.025869715958833694, "timestamp": "2025-09-15 03:19:18.388025", "step": 437, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:18.417735", "step": 437, "epoch": 1 }, { "type": "loss", "content": 0.02636927366256714, "timestamp": "2025-09-15 03:19:18.420056", "step": 438, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:18.449967", "step": 438, "epoch": 1 }, { "type": "loss", "content": 0.04045248404145241, "timestamp": "2025-09-15 03:19:18.452124", "step": 439, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:18.481852", "step": 439, "epoch": 1 }, { "type": "loss", "content": 0.02000604197382927, "timestamp": "2025-09-15 03:19:18.505406", "step": 440, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:18.535658", "step": 440, "epoch": 1 }, { "type": "loss", "content": 0.03176813945174217, "timestamp": "2025-09-15 03:19:18.537779", "step": 441, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:18.567747", "step": 441, "epoch": 1 }, { "type": "loss", "content": 0.03146418556571007, "timestamp": "2025-09-15 03:19:18.569892", "step": 442, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:18.604064", "step": 442, "epoch": 1 }, { "type": "loss", "content": 0.012470101937651634, "timestamp": "2025-09-15 03:19:18.606130", "step": 443, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:18.636695", "step": 443, "epoch": 1 }, { "type": "loss", "content": 0.03529360517859459, "timestamp": "2025-09-15 03:19:18.660438", "step": 444, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:18.690103", "step": 444, "epoch": 1 }, { "type": "loss", "content": 0.010506563819944859, "timestamp": "2025-09-15 03:19:18.692093", "step": 445, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:18.721987", "step": 445, "epoch": 1 }, { "type": "loss", "content": 0.019400015473365784, "timestamp": "2025-09-15 03:19:18.724012", "step": 446, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:18.754433", "step": 446, "epoch": 1 }, { "type": "loss", "content": 0.007648042868822813, "timestamp": "2025-09-15 03:19:18.756768", "step": 447, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:18.786715", "step": 447, "epoch": 1 }, { "type": "loss", "content": 0.013138832524418831, "timestamp": "2025-09-15 03:19:18.810090", "step": 448, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:18.840475", "step": 448, "epoch": 1 }, { "type": "loss", "content": 0.008471040055155754, "timestamp": "2025-09-15 03:19:18.842879", "step": 449, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:18.873896", "step": 449, "epoch": 1 }, { "type": "loss", "content": 0.030580993741750717, "timestamp": "2025-09-15 03:19:18.876001", "step": 450, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:18.906957", "step": 450, "epoch": 1 }, { "type": "loss", "content": 0.021732794120907784, "timestamp": "2025-09-15 03:19:18.909053", "step": 451, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:18.939066", "step": 451, "epoch": 1 }, { "type": "loss", "content": 0.008800463750958443, "timestamp": "2025-09-15 03:19:18.964526", "step": 452, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:18.995324", "step": 452, "epoch": 1 }, { "type": "loss", "content": 0.017960723489522934, "timestamp": "2025-09-15 03:19:18.997206", "step": 453, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:19:19.027401", "step": 453, "epoch": 1 }, { "type": "loss", "content": 0.019798072054982185, "timestamp": "2025-09-15 03:19:19.030007", "step": 454, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:19.060014", "step": 454, "epoch": 1 }, { "type": "loss", "content": 0.0256817489862442, "timestamp": "2025-09-15 03:19:19.062247", "step": 455, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:19.092101", "step": 455, "epoch": 1 }, { "type": "loss", "content": 0.012163102626800537, "timestamp": "2025-09-15 03:19:19.115459", "step": 456, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:19:19.830229", "step": 456, "epoch": 1 }, { "type": "pplx", "content": 106593515.23342364, "timestamp": "2025-09-15 03:19:19.832687", "step": 456, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:19.860789", "step": 456, "epoch": 1 }, { "type": "loss", "content": 0.022691946476697922, "timestamp": "2025-09-15 03:19:19.862834", "step": 457, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:19.894401", "step": 457, "epoch": 1 }, { "type": "loss", "content": 0.033485524356365204, "timestamp": "2025-09-15 03:19:19.896450", "step": 458, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:19.926149", "step": 458, "epoch": 1 }, { "type": "loss", "content": 0.011042545549571514, "timestamp": "2025-09-15 03:19:19.928445", "step": 459, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:19.958705", "step": 459, "epoch": 1 }, { "type": "loss", "content": 0.032331619411706924, "timestamp": "2025-09-15 03:19:19.982465", "step": 460, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:20.012147", "step": 460, "epoch": 1 }, { "type": "loss", "content": 0.018903549760580063, "timestamp": "2025-09-15 03:19:20.014388", "step": 461, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:20.044231", "step": 461, "epoch": 1 }, { "type": "loss", "content": 0.011640116572380066, "timestamp": "2025-09-15 03:19:20.046361", "step": 462, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:20.076555", "step": 462, "epoch": 1 }, { "type": "loss", "content": 0.009794848039746284, "timestamp": "2025-09-15 03:19:20.078607", "step": 463, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:20.108563", "step": 463, "epoch": 1 }, { "type": "loss", "content": 0.02081950567662716, "timestamp": "2025-09-15 03:19:20.132053", "step": 464, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:20.162248", "step": 464, "epoch": 1 }, { "type": "loss", "content": 0.01612214185297489, "timestamp": "2025-09-15 03:19:20.164354", "step": 465, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:20.194560", "step": 465, "epoch": 1 }, { "type": "loss", "content": 0.014096572063863277, "timestamp": "2025-09-15 03:19:20.196538", "step": 466, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:20.226672", "step": 466, "epoch": 1 }, { "type": "loss", "content": 0.019644200801849365, "timestamp": "2025-09-15 03:19:20.228929", "step": 467, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:20.258638", "step": 467, "epoch": 1 }, { "type": "loss", "content": 0.02487204596400261, "timestamp": "2025-09-15 03:19:20.282026", "step": 468, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:20.311901", "step": 468, "epoch": 1 }, { "type": "loss", "content": 0.02327965758740902, "timestamp": "2025-09-15 03:19:20.314077", "step": 469, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:20.343694", "step": 469, "epoch": 1 }, { "type": "loss", "content": 0.03325268253684044, "timestamp": "2025-09-15 03:19:20.345817", "step": 470, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:20.375868", "step": 470, "epoch": 1 }, { "type": "loss", "content": 0.0069459849037230015, "timestamp": "2025-09-15 03:19:20.378369", "step": 471, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:20.408212", "step": 471, "epoch": 1 }, { "type": "loss", "content": 0.01827104203402996, "timestamp": "2025-09-15 03:19:20.431708", "step": 472, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:20.462068", "step": 472, "epoch": 1 }, { "type": "loss", "content": 0.019391119480133057, "timestamp": "2025-09-15 03:19:20.464671", "step": 473, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:20.494805", "step": 473, "epoch": 1 }, { "type": "loss", "content": 0.013398611918091774, "timestamp": "2025-09-15 03:19:20.497228", "step": 474, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:20.526884", "step": 474, "epoch": 1 }, { "type": "loss", "content": 0.011475841514766216, "timestamp": "2025-09-15 03:19:20.529188", "step": 475, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:20.558975", "step": 475, "epoch": 1 }, { "type": "loss", "content": 0.012057404033839703, "timestamp": "2025-09-15 03:19:20.582317", "step": 476, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:20.611838", "step": 476, "epoch": 1 }, { "type": "loss", "content": 0.013628379441797733, "timestamp": "2025-09-15 03:19:20.614030", "step": 477, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:20.644198", "step": 477, "epoch": 1 }, { "type": "loss", "content": 0.013962375931441784, "timestamp": "2025-09-15 03:19:20.646347", "step": 478, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:20.676826", "step": 478, "epoch": 1 }, { "type": "loss", "content": 0.014084680937230587, "timestamp": "2025-09-15 03:19:20.678829", "step": 479, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:20.708906", "step": 479, "epoch": 1 }, { "type": "loss", "content": 0.007172218058258295, "timestamp": "2025-09-15 03:19:20.732339", "step": 480, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:20.761832", "step": 480, "epoch": 1 }, { "type": "loss", "content": 0.02609698474407196, "timestamp": "2025-09-15 03:19:20.763996", "step": 481, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:20.793904", "step": 481, "epoch": 1 }, { "type": "loss", "content": 0.027867257595062256, "timestamp": "2025-09-15 03:19:20.795800", "step": 482, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:20.826089", "step": 482, "epoch": 1 }, { "type": "loss", "content": 0.012075444683432579, "timestamp": "2025-09-15 03:19:20.828206", "step": 483, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:20.858430", "step": 483, "epoch": 1 }, { "type": "loss", "content": 0.01756882481276989, "timestamp": "2025-09-15 03:19:20.883340", "step": 484, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:20.913996", "step": 484, "epoch": 1 }, { "type": "loss", "content": 0.01686927303671837, "timestamp": "2025-09-15 03:19:20.916170", "step": 485, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:20.945871", "step": 485, "epoch": 1 }, { "type": "loss", "content": 0.013569544069468975, "timestamp": "2025-09-15 03:19:20.948173", "step": 486, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:20.978420", "step": 486, "epoch": 1 }, { "type": "loss", "content": 0.027030980214476585, "timestamp": "2025-09-15 03:19:20.980683", "step": 487, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:21.011351", "step": 487, "epoch": 1 }, { "type": "loss", "content": 0.021866770461201668, "timestamp": "2025-09-15 03:19:21.034950", "step": 488, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:21.064837", "step": 488, "epoch": 1 }, { "type": "loss", "content": 0.015595144592225552, "timestamp": "2025-09-15 03:19:21.066964", "step": 489, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:21.096985", "step": 489, "epoch": 1 }, { "type": "loss", "content": 0.011602642014622688, "timestamp": "2025-09-15 03:19:21.099366", "step": 490, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:21.128752", "step": 490, "epoch": 1 }, { "type": "loss", "content": 0.01822063885629177, "timestamp": "2025-09-15 03:19:21.130765", "step": 491, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:21.160810", "step": 491, "epoch": 1 }, { "type": "loss", "content": 0.008895600214600563, "timestamp": "2025-09-15 03:19:21.184264", "step": 492, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:21.214166", "step": 492, "epoch": 1 }, { "type": "loss", "content": 0.008032863028347492, "timestamp": "2025-09-15 03:19:21.216142", "step": 493, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:21.245758", "step": 493, "epoch": 1 }, { "type": "loss", "content": 0.03705013543367386, "timestamp": "2025-09-15 03:19:21.247899", "step": 494, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:21.277937", "step": 494, "epoch": 1 }, { "type": "loss", "content": 0.012541859410703182, "timestamp": "2025-09-15 03:19:21.280004", "step": 495, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:21.312454", "step": 495, "epoch": 1 }, { "type": "loss", "content": 0.01749964989721775, "timestamp": "2025-09-15 03:19:21.336261", "step": 496, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:21.366956", "step": 496, "epoch": 1 }, { "type": "loss", "content": 0.00825695414096117, "timestamp": "2025-09-15 03:19:21.369467", "step": 497, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:21.399465", "step": 497, "epoch": 1 }, { "type": "loss", "content": 0.0126886498183012, "timestamp": "2025-09-15 03:19:21.401727", "step": 498, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:21.432115", "step": 498, "epoch": 1 }, { "type": "loss", "content": 0.01961720548570156, "timestamp": "2025-09-15 03:19:21.434180", "step": 499, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:21.463839", "step": 499, "epoch": 1 }, { "type": "loss", "content": 0.006928745657205582, "timestamp": "2025-09-15 03:19:21.487376", "step": 500, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 500", "timestamp": "2025-09-15 03:19:27.943224", "step": 500, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:27.980027", "step": 500, "epoch": 1 }, { "type": "loss", "content": 0.00608115503564477, "timestamp": "2025-09-15 03:19:27.982317", "step": 501, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:28.013857", "step": 501, "epoch": 1 }, { "type": "loss", "content": 0.018389714881777763, "timestamp": "2025-09-15 03:19:28.016328", "step": 502, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:28.046536", "step": 502, "epoch": 1 }, { "type": "loss", "content": 0.014615455642342567, "timestamp": "2025-09-15 03:19:28.048877", "step": 503, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:28.079057", "step": 503, "epoch": 1 }, { "type": "loss", "content": 0.018191883340477943, "timestamp": "2025-09-15 03:19:28.102793", "step": 504, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:28.132932", "step": 504, "epoch": 1 }, { "type": "loss", "content": 0.00259787798859179, "timestamp": "2025-09-15 03:19:28.135309", "step": 505, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:28.165002", "step": 505, "epoch": 1 }, { "type": "loss", "content": 0.050298064947128296, "timestamp": "2025-09-15 03:19:28.167124", "step": 506, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:28.197378", "step": 506, "epoch": 1 }, { "type": "loss", "content": 0.028524816036224365, "timestamp": "2025-09-15 03:19:28.199756", "step": 507, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:28.230000", "step": 507, "epoch": 1 }, { "type": "loss", "content": 0.03299025818705559, "timestamp": "2025-09-15 03:19:28.253871", "step": 508, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:28.284729", "step": 508, "epoch": 1 }, { "type": "loss", "content": 0.005647748708724976, "timestamp": "2025-09-15 03:19:28.287110", "step": 509, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:28.317178", "step": 509, "epoch": 1 }, { "type": "loss", "content": 0.01253785751760006, "timestamp": "2025-09-15 03:19:28.319089", "step": 510, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:28.349076", "step": 510, "epoch": 1 }, { "type": "loss", "content": 0.03369123488664627, "timestamp": "2025-09-15 03:19:28.351251", "step": 511, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:28.382327", "step": 511, "epoch": 1 }, { "type": "loss", "content": 0.003224168438464403, "timestamp": "2025-09-15 03:19:28.406167", "step": 512, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:28.436996", "step": 512, "epoch": 1 }, { "type": "loss", "content": 0.02117718942463398, "timestamp": "2025-09-15 03:19:28.439068", "step": 513, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:19:29.155360", "step": 513, "epoch": 1 }, { "type": "pplx", "content": 103196321.15215471, "timestamp": "2025-09-15 03:19:29.157619", "step": 513, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:29.186536", "step": 513, "epoch": 1 }, { "type": "loss", "content": 0.005289103835821152, "timestamp": "2025-09-15 03:19:29.188811", "step": 514, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:29.219031", "step": 514, "epoch": 1 }, { "type": "loss", "content": 0.019699811935424805, "timestamp": "2025-09-15 03:19:29.222466", "step": 515, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:29.253330", "step": 515, "epoch": 1 }, { "type": "loss", "content": 0.025011127814650536, "timestamp": "2025-09-15 03:19:29.276960", "step": 516, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:29.306759", "step": 516, "epoch": 1 }, { "type": "loss", "content": 0.032631583511829376, "timestamp": "2025-09-15 03:19:29.308971", "step": 517, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:29.338686", "step": 517, "epoch": 1 }, { "type": "loss", "content": 0.027445856481790543, "timestamp": "2025-09-15 03:19:29.340803", "step": 518, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:29.370764", "step": 518, "epoch": 1 }, { "type": "loss", "content": 0.01845047064125538, "timestamp": "2025-09-15 03:19:29.372918", "step": 519, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:29.403375", "step": 519, "epoch": 1 }, { "type": "loss", "content": 0.00743445660918951, "timestamp": "2025-09-15 03:19:29.427047", "step": 520, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:29.456855", "step": 520, "epoch": 1 }, { "type": "loss", "content": 0.02367217466235161, "timestamp": "2025-09-15 03:19:29.459092", "step": 521, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:29.488834", "step": 521, "epoch": 1 }, { "type": "loss", "content": 0.022589536383748055, "timestamp": "2025-09-15 03:19:29.491088", "step": 522, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:29.521721", "step": 522, "epoch": 1 }, { "type": "loss", "content": 0.007371582556515932, "timestamp": "2025-09-15 03:19:29.523929", "step": 523, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:29.554220", "step": 523, "epoch": 1 }, { "type": "loss", "content": 0.0034837510902434587, "timestamp": "2025-09-15 03:19:29.578015", "step": 524, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:29.608232", "step": 524, "epoch": 1 }, { "type": "loss", "content": 0.012912024743855, "timestamp": "2025-09-15 03:19:29.610343", "step": 525, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:29.641765", "step": 525, "epoch": 1 }, { "type": "loss", "content": 0.02050393633544445, "timestamp": "2025-09-15 03:19:29.643979", "step": 526, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:29.673935", "step": 526, "epoch": 1 }, { "type": "loss", "content": 0.009708769619464874, "timestamp": "2025-09-15 03:19:29.676201", "step": 527, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:29.705996", "step": 527, "epoch": 1 }, { "type": "loss", "content": 0.014362855814397335, "timestamp": "2025-09-15 03:19:29.729561", "step": 528, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:29.759730", "step": 528, "epoch": 1 }, { "type": "loss", "content": 0.009035781025886536, "timestamp": "2025-09-15 03:19:29.762308", "step": 529, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:29.792112", "step": 529, "epoch": 1 }, { "type": "loss", "content": 0.007510147523134947, "timestamp": "2025-09-15 03:19:29.794341", "step": 530, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:29.824151", "step": 530, "epoch": 1 }, { "type": "loss", "content": 0.0246110912412405, "timestamp": "2025-09-15 03:19:29.826313", "step": 531, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:29.856838", "step": 531, "epoch": 1 }, { "type": "loss", "content": 0.015608777292072773, "timestamp": "2025-09-15 03:19:29.880601", "step": 532, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:29.910643", "step": 532, "epoch": 1 }, { "type": "loss", "content": 0.01901078037917614, "timestamp": "2025-09-15 03:19:29.912820", "step": 533, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:29.942909", "step": 533, "epoch": 1 }, { "type": "loss", "content": 0.036900751292705536, "timestamp": "2025-09-15 03:19:29.945423", "step": 534, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:29.975951", "step": 534, "epoch": 1 }, { "type": "loss", "content": 0.02328217215836048, "timestamp": "2025-09-15 03:19:29.978717", "step": 535, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:30.009678", "step": 535, "epoch": 1 }, { "type": "loss", "content": 0.00571950851008296, "timestamp": "2025-09-15 03:19:30.033333", "step": 536, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:30.064355", "step": 536, "epoch": 1 }, { "type": "loss", "content": 0.036896418780088425, "timestamp": "2025-09-15 03:19:30.066492", "step": 537, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:30.096238", "step": 537, "epoch": 1 }, { "type": "loss", "content": 0.015397454611957073, "timestamp": "2025-09-15 03:19:30.098994", "step": 538, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:30.129739", "step": 538, "epoch": 1 }, { "type": "loss", "content": 0.057132985442876816, "timestamp": "2025-09-15 03:19:30.133129", "step": 539, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:30.163818", "step": 539, "epoch": 1 }, { "type": "loss", "content": 0.017368067055940628, "timestamp": "2025-09-15 03:19:30.188068", "step": 540, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:30.218982", "step": 540, "epoch": 1 }, { "type": "loss", "content": 0.033790573477745056, "timestamp": "2025-09-15 03:19:30.221931", "step": 541, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:30.252705", "step": 541, "epoch": 1 }, { "type": "loss", "content": 0.014461012557148933, "timestamp": "2025-09-15 03:19:30.255093", "step": 542, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:30.285723", "step": 542, "epoch": 1 }, { "type": "loss", "content": 0.005903469864279032, "timestamp": "2025-09-15 03:19:30.288147", "step": 543, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:30.318215", "step": 543, "epoch": 1 }, { "type": "loss", "content": 0.0222102589905262, "timestamp": "2025-09-15 03:19:30.342045", "step": 544, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:30.372445", "step": 544, "epoch": 1 }, { "type": "loss", "content": 0.030389001592993736, "timestamp": "2025-09-15 03:19:30.374463", "step": 545, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:30.404540", "step": 545, "epoch": 1 }, { "type": "loss", "content": 0.030530178919434547, "timestamp": "2025-09-15 03:19:30.406642", "step": 546, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:30.436463", "step": 546, "epoch": 1 }, { "type": "loss", "content": 0.043474406003952026, "timestamp": "2025-09-15 03:19:30.438647", "step": 547, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:30.469399", "step": 547, "epoch": 1 }, { "type": "loss", "content": 0.025674572214484215, "timestamp": "2025-09-15 03:19:30.493123", "step": 548, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:30.523887", "step": 548, "epoch": 1 }, { "type": "loss", "content": 0.039770692586898804, "timestamp": "2025-09-15 03:19:30.526123", "step": 549, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:30.555752", "step": 549, "epoch": 1 }, { "type": "loss", "content": 0.028351102024316788, "timestamp": "2025-09-15 03:19:30.558088", "step": 550, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:30.587947", "step": 550, "epoch": 1 }, { "type": "loss", "content": 0.024992715567350388, "timestamp": "2025-09-15 03:19:30.590513", "step": 551, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:30.620647", "step": 551, "epoch": 1 }, { "type": "loss", "content": 0.0335613451898098, "timestamp": "2025-09-15 03:19:30.644598", "step": 552, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:30.674895", "step": 552, "epoch": 1 }, { "type": "loss", "content": 0.03769141435623169, "timestamp": "2025-09-15 03:19:30.677221", "step": 553, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:30.707719", "step": 553, "epoch": 1 }, { "type": "loss", "content": 0.0256858728826046, "timestamp": "2025-09-15 03:19:30.710855", "step": 554, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:30.743064", "step": 554, "epoch": 1 }, { "type": "loss", "content": 0.008078259415924549, "timestamp": "2025-09-15 03:19:30.745361", "step": 555, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:30.775339", "step": 555, "epoch": 1 }, { "type": "loss", "content": 0.033207204192876816, "timestamp": "2025-09-15 03:19:30.798821", "step": 556, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:30.828644", "step": 556, "epoch": 1 }, { "type": "loss", "content": 0.027974357828497887, "timestamp": "2025-09-15 03:19:30.831194", "step": 557, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:30.861194", "step": 557, "epoch": 1 }, { "type": "loss", "content": 0.017816243693232536, "timestamp": "2025-09-15 03:19:30.868426", "step": 558, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:30.898405", "step": 558, "epoch": 1 }, { "type": "loss", "content": 0.026075145229697227, "timestamp": "2025-09-15 03:19:30.900764", "step": 559, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:30.943455", "step": 559, "epoch": 1 }, { "type": "loss", "content": 0.010549420490860939, "timestamp": "2025-09-15 03:19:30.967281", "step": 560, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:30.997163", "step": 560, "epoch": 1 }, { "type": "loss", "content": 0.01911732740700245, "timestamp": "2025-09-15 03:19:30.999762", "step": 561, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:31.029528", "step": 561, "epoch": 1 }, { "type": "loss", "content": 0.017765216529369354, "timestamp": "2025-09-15 03:19:31.031797", "step": 562, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:31.061228", "step": 562, "epoch": 1 }, { "type": "loss", "content": 0.018333345651626587, "timestamp": "2025-09-15 03:19:31.063508", "step": 563, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:31.093491", "step": 563, "epoch": 1 }, { "type": "loss", "content": 0.026418427005410194, "timestamp": "2025-09-15 03:19:31.117230", "step": 564, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:31.147881", "step": 564, "epoch": 1 }, { "type": "loss", "content": 0.02311128005385399, "timestamp": "2025-09-15 03:19:31.150294", "step": 565, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:31.180101", "step": 565, "epoch": 1 }, { "type": "loss", "content": 0.018931102007627487, "timestamp": "2025-09-15 03:19:31.182476", "step": 566, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:31.212574", "step": 566, "epoch": 1 }, { "type": "loss", "content": 0.017706727609038353, "timestamp": "2025-09-15 03:19:31.214942", "step": 567, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:19:31.245282", "step": 567, "epoch": 1 }, { "type": "loss", "content": 0.022706808522343636, "timestamp": "2025-09-15 03:19:31.270109", "step": 568, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:31.302088", "step": 568, "epoch": 1 }, { "type": "loss", "content": 0.03452968969941139, "timestamp": "2025-09-15 03:19:31.304874", "step": 569, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:31.334973", "step": 569, "epoch": 1 }, { "type": "loss", "content": 0.01722005195915699, "timestamp": "2025-09-15 03:19:31.337600", "step": 570, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:19:32.064332", "step": 570, "epoch": 1 }, { "type": "pplx", "content": 121926762.5949161, "timestamp": "2025-09-15 03:19:32.066255", "step": 570, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:32.095386", "step": 570, "epoch": 1 }, { "type": "loss", "content": 0.015566927380859852, "timestamp": "2025-09-15 03:19:32.097639", "step": 571, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:32.127943", "step": 571, "epoch": 1 }, { "type": "loss", "content": 0.01759842224419117, "timestamp": "2025-09-15 03:19:32.151554", "step": 572, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:32.181411", "step": 572, "epoch": 1 }, { "type": "loss", "content": 0.0242477860301733, "timestamp": "2025-09-15 03:19:32.183399", "step": 573, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:32.214246", "step": 573, "epoch": 1 }, { "type": "loss", "content": 0.01448657363653183, "timestamp": "2025-09-15 03:19:32.216438", "step": 574, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:32.246880", "step": 574, "epoch": 1 }, { "type": "loss", "content": 0.02040638029575348, "timestamp": "2025-09-15 03:19:32.248954", "step": 575, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:32.279047", "step": 575, "epoch": 1 }, { "type": "loss", "content": 0.016124863177537918, "timestamp": "2025-09-15 03:19:32.302415", "step": 576, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:32.332806", "step": 576, "epoch": 1 }, { "type": "loss", "content": 0.021785302087664604, "timestamp": "2025-09-15 03:19:32.334799", "step": 577, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:19:32.372166", "step": 577, "epoch": 1 }, { "type": "loss", "content": 0.0231538824737072, "timestamp": "2025-09-15 03:19:32.374930", "step": 578, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:32.405741", "step": 578, "epoch": 1 }, { "type": "loss", "content": 0.01654941961169243, "timestamp": "2025-09-15 03:19:32.409225", "step": 579, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:32.440079", "step": 579, "epoch": 1 }, { "type": "loss", "content": 0.013618906028568745, "timestamp": "2025-09-15 03:19:32.463532", "step": 580, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:32.494012", "step": 580, "epoch": 1 }, { "type": "loss", "content": 0.016907773911952972, "timestamp": "2025-09-15 03:19:32.496305", "step": 581, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:32.526226", "step": 581, "epoch": 1 }, { "type": "loss", "content": 0.012379102408885956, "timestamp": "2025-09-15 03:19:32.529435", "step": 582, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:32.561344", "step": 582, "epoch": 1 }, { "type": "loss", "content": 0.022719666361808777, "timestamp": "2025-09-15 03:19:32.563677", "step": 583, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:32.593679", "step": 583, "epoch": 1 }, { "type": "loss", "content": 0.01793176494538784, "timestamp": "2025-09-15 03:19:32.617414", "step": 584, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:32.648356", "step": 584, "epoch": 1 }, { "type": "loss", "content": 0.016040056943893433, "timestamp": "2025-09-15 03:19:32.650958", "step": 585, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:32.681070", "step": 585, "epoch": 1 }, { "type": "loss", "content": 0.01226879097521305, "timestamp": "2025-09-15 03:19:32.683566", "step": 586, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:32.713784", "step": 586, "epoch": 1 }, { "type": "loss", "content": 0.0086965998634696, "timestamp": "2025-09-15 03:19:32.716034", "step": 587, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:32.745741", "step": 587, "epoch": 1 }, { "type": "loss", "content": 0.026391183957457542, "timestamp": "2025-09-15 03:19:32.769225", "step": 588, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:32.798696", "step": 588, "epoch": 1 }, { "type": "loss", "content": 0.024788683280348778, "timestamp": "2025-09-15 03:19:32.800849", "step": 589, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:32.830603", "step": 589, "epoch": 1 }, { "type": "loss", "content": 0.02367064729332924, "timestamp": "2025-09-15 03:19:32.832586", "step": 590, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:19:32.880943", "step": 590, "epoch": 1 }, { "type": "loss", "content": 0.02348589338362217, "timestamp": "2025-09-15 03:19:32.883116", "step": 591, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:32.913202", "step": 591, "epoch": 1 }, { "type": "loss", "content": 0.017809707671403885, "timestamp": "2025-09-15 03:19:32.937348", "step": 592, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:32.967478", "step": 592, "epoch": 1 }, { "type": "loss", "content": 0.02226695977151394, "timestamp": "2025-09-15 03:19:32.969704", "step": 593, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:33.000156", "step": 593, "epoch": 1 }, { "type": "loss", "content": 0.01061128918081522, "timestamp": "2025-09-15 03:19:33.002061", "step": 594, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:33.031555", "step": 594, "epoch": 1 }, { "type": "loss", "content": 0.018722638487815857, "timestamp": "2025-09-15 03:19:33.033744", "step": 595, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:33.065088", "step": 595, "epoch": 1 }, { "type": "loss", "content": 0.0316433385014534, "timestamp": "2025-09-15 03:19:33.088691", "step": 596, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:33.119482", "step": 596, "epoch": 1 }, { "type": "loss", "content": 0.020807506516575813, "timestamp": "2025-09-15 03:19:33.121812", "step": 597, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:33.151699", "step": 597, "epoch": 1 }, { "type": "loss", "content": 0.016805419698357582, "timestamp": "2025-09-15 03:19:33.153659", "step": 598, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:33.184121", "step": 598, "epoch": 1 }, { "type": "loss", "content": 0.03364928811788559, "timestamp": "2025-09-15 03:19:33.186308", "step": 599, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:33.216288", "step": 599, "epoch": 1 }, { "type": "loss", "content": 0.02124728076159954, "timestamp": "2025-09-15 03:19:33.240750", "step": 600, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:33.270946", "step": 600, "epoch": 1 }, { "type": "loss", "content": 0.02416917122900486, "timestamp": "2025-09-15 03:19:33.273304", "step": 601, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:33.303700", "step": 601, "epoch": 1 }, { "type": "loss", "content": 0.022851431742310524, "timestamp": "2025-09-15 03:19:33.305762", "step": 602, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:33.336610", "step": 602, "epoch": 1 }, { "type": "loss", "content": 0.014495198614895344, "timestamp": "2025-09-15 03:19:33.338762", "step": 603, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:33.371483", "step": 603, "epoch": 1 }, { "type": "loss", "content": 0.02507566474378109, "timestamp": "2025-09-15 03:19:33.394809", "step": 604, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:33.425072", "step": 604, "epoch": 1 }, { "type": "loss", "content": 0.019076064229011536, "timestamp": "2025-09-15 03:19:33.427298", "step": 605, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:33.457171", "step": 605, "epoch": 1 }, { "type": "loss", "content": 0.018669435754418373, "timestamp": "2025-09-15 03:19:33.459227", "step": 606, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:33.489602", "step": 606, "epoch": 1 }, { "type": "loss", "content": 0.018617525696754456, "timestamp": "2025-09-15 03:19:33.491896", "step": 607, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:33.523534", "step": 607, "epoch": 1 }, { "type": "loss", "content": 0.015499304048717022, "timestamp": "2025-09-15 03:19:33.547160", "step": 608, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:33.577007", "step": 608, "epoch": 1 }, { "type": "loss", "content": 0.019901065155863762, "timestamp": "2025-09-15 03:19:33.579301", "step": 609, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:33.609797", "step": 609, "epoch": 1 }, { "type": "loss", "content": 0.019307816401124, "timestamp": "2025-09-15 03:19:33.612044", "step": 610, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:33.642385", "step": 610, "epoch": 1 }, { "type": "loss", "content": 0.02826797403395176, "timestamp": "2025-09-15 03:19:33.644622", "step": 611, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:33.674634", "step": 611, "epoch": 1 }, { "type": "loss", "content": 0.03018580749630928, "timestamp": "2025-09-15 03:19:33.698326", "step": 612, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:33.728321", "step": 612, "epoch": 1 }, { "type": "loss", "content": 0.023904601112008095, "timestamp": "2025-09-15 03:19:33.730297", "step": 613, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:33.760108", "step": 613, "epoch": 1 }, { "type": "loss", "content": 0.014215878210961819, "timestamp": "2025-09-15 03:19:33.762305", "step": 614, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:33.792435", "step": 614, "epoch": 1 }, { "type": "loss", "content": 0.031668949872255325, "timestamp": "2025-09-15 03:19:33.794748", "step": 615, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:33.825077", "step": 615, "epoch": 1 }, { "type": "loss", "content": 0.020902784541249275, "timestamp": "2025-09-15 03:19:33.848960", "step": 616, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:33.878880", "step": 616, "epoch": 1 }, { "type": "loss", "content": 0.032585740089416504, "timestamp": "2025-09-15 03:19:33.881426", "step": 617, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:33.911931", "step": 617, "epoch": 1 }, { "type": "loss", "content": 0.016043754294514656, "timestamp": "2025-09-15 03:19:33.913997", "step": 618, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:33.944384", "step": 618, "epoch": 1 }, { "type": "loss", "content": 0.024471482262015343, "timestamp": "2025-09-15 03:19:33.946630", "step": 619, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:33.979520", "step": 619, "epoch": 1 }, { "type": "loss", "content": 0.016859030351042747, "timestamp": "2025-09-15 03:19:34.003549", "step": 620, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:34.033537", "step": 620, "epoch": 1 }, { "type": "loss", "content": 0.016345178708434105, "timestamp": "2025-09-15 03:19:34.035506", "step": 621, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:34.065306", "step": 621, "epoch": 1 }, { "type": "loss", "content": 0.015494337305426598, "timestamp": "2025-09-15 03:19:34.067448", "step": 622, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:34.097216", "step": 622, "epoch": 1 }, { "type": "loss", "content": 0.016618309542536736, "timestamp": "2025-09-15 03:19:34.099186", "step": 623, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:34.128811", "step": 623, "epoch": 1 }, { "type": "loss", "content": 0.03520876169204712, "timestamp": "2025-09-15 03:19:34.152306", "step": 624, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:34.184480", "step": 624, "epoch": 1 }, { "type": "loss", "content": 0.017803248018026352, "timestamp": "2025-09-15 03:19:34.186904", "step": 625, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:34.216925", "step": 625, "epoch": 1 }, { "type": "loss", "content": 0.011919400654733181, "timestamp": "2025-09-15 03:19:34.218932", "step": 626, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:34.249088", "step": 626, "epoch": 1 }, { "type": "loss", "content": 0.006866747047752142, "timestamp": "2025-09-15 03:19:34.251058", "step": 627, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:19:34.966517", "step": 627, "epoch": 1 }, { "type": "pplx", "content": 113143370.79268838, "timestamp": "2025-09-15 03:19:34.968536", "step": 627, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:34.997423", "step": 627, "epoch": 1 }, { "type": "loss", "content": 0.009564869105815887, "timestamp": "2025-09-15 03:19:35.021678", "step": 628, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:35.052415", "step": 628, "epoch": 1 }, { "type": "loss", "content": 0.0398293137550354, "timestamp": "2025-09-15 03:19:35.054331", "step": 629, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:35.088216", "step": 629, "epoch": 1 }, { "type": "loss", "content": 0.030554279685020447, "timestamp": "2025-09-15 03:19:35.090620", "step": 630, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:35.121531", "step": 630, "epoch": 1 }, { "type": "loss", "content": 0.01749667339026928, "timestamp": "2025-09-15 03:19:35.123672", "step": 631, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:35.154520", "step": 631, "epoch": 1 }, { "type": "loss", "content": 0.015980729833245277, "timestamp": "2025-09-15 03:19:35.180987", "step": 632, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:35.219404", "step": 632, "epoch": 1 }, { "type": "loss", "content": 0.005738109350204468, "timestamp": "2025-09-15 03:19:35.222413", "step": 633, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:35.252946", "step": 633, "epoch": 1 }, { "type": "loss", "content": 0.009976466186344624, "timestamp": "2025-09-15 03:19:35.254880", "step": 634, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:35.285492", "step": 634, "epoch": 1 }, { "type": "loss", "content": 0.02471822127699852, "timestamp": "2025-09-15 03:19:35.287618", "step": 635, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:35.317306", "step": 635, "epoch": 1 }, { "type": "loss", "content": 0.008112783543765545, "timestamp": "2025-09-15 03:19:35.341896", "step": 636, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:35.372410", "step": 636, "epoch": 1 }, { "type": "loss", "content": 0.0037000139709562063, "timestamp": "2025-09-15 03:19:35.374188", "step": 637, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:35.404221", "step": 637, "epoch": 1 }, { "type": "loss", "content": 0.032425593584775925, "timestamp": "2025-09-15 03:19:35.406151", "step": 638, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:35.436237", "step": 638, "epoch": 1 }, { "type": "loss", "content": 0.032084185630083084, "timestamp": "2025-09-15 03:19:35.440432", "step": 639, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:35.469637", "step": 639, "epoch": 1 }, { "type": "loss", "content": 0.003243770683184266, "timestamp": "2025-09-15 03:19:35.499485", "step": 640, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:35.529955", "step": 640, "epoch": 1 }, { "type": "loss", "content": 0.009236874990165234, "timestamp": "2025-09-15 03:19:35.532339", "step": 641, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:35.562536", "step": 641, "epoch": 1 }, { "type": "loss", "content": 0.016537828370928764, "timestamp": "2025-09-15 03:19:35.564759", "step": 642, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:35.599429", "step": 642, "epoch": 1 }, { "type": "loss", "content": 0.007999381981790066, "timestamp": "2025-09-15 03:19:35.601425", "step": 643, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:35.631660", "step": 643, "epoch": 1 }, { "type": "loss", "content": 0.03624844551086426, "timestamp": "2025-09-15 03:19:35.655749", "step": 644, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:35.687889", "step": 644, "epoch": 1 }, { "type": "loss", "content": 0.0090530039742589, "timestamp": "2025-09-15 03:19:35.689793", "step": 645, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:35.719723", "step": 645, "epoch": 1 }, { "type": "loss", "content": 0.03531469404697418, "timestamp": "2025-09-15 03:19:35.721763", "step": 646, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:35.752701", "step": 646, "epoch": 1 }, { "type": "loss", "content": 0.007365099154412746, "timestamp": "2025-09-15 03:19:35.755023", "step": 647, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:35.785323", "step": 647, "epoch": 1 }, { "type": "loss", "content": 0.006562412716448307, "timestamp": "2025-09-15 03:19:35.808615", "step": 648, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:35.838453", "step": 648, "epoch": 1 }, { "type": "loss", "content": 0.0090226074680686, "timestamp": "2025-09-15 03:19:35.840473", "step": 649, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:35.870557", "step": 649, "epoch": 1 }, { "type": "loss", "content": 0.006006971467286348, "timestamp": "2025-09-15 03:19:35.872732", "step": 650, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:35.902827", "step": 650, "epoch": 1 }, { "type": "loss", "content": 0.017953816801309586, "timestamp": "2025-09-15 03:19:35.904940", "step": 651, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:35.934800", "step": 651, "epoch": 1 }, { "type": "loss", "content": 0.014853633008897305, "timestamp": "2025-09-15 03:19:35.958290", "step": 652, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:35.988599", "step": 652, "epoch": 1 }, { "type": "loss", "content": 0.01929437555372715, "timestamp": "2025-09-15 03:19:35.990404", "step": 653, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:36.020135", "step": 653, "epoch": 1 }, { "type": "loss", "content": 0.027933437377214432, "timestamp": "2025-09-15 03:19:36.022258", "step": 654, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:36.053501", "step": 654, "epoch": 1 }, { "type": "loss", "content": 0.007303243037313223, "timestamp": "2025-09-15 03:19:36.056158", "step": 655, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:36.086539", "step": 655, "epoch": 1 }, { "type": "loss", "content": 0.005802966188639402, "timestamp": "2025-09-15 03:19:36.109977", "step": 656, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:36.139863", "step": 656, "epoch": 1 }, { "type": "loss", "content": 0.0036463297437876463, "timestamp": "2025-09-15 03:19:36.142126", "step": 657, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:36.171715", "step": 657, "epoch": 1 }, { "type": "loss", "content": 0.01936980150640011, "timestamp": "2025-09-15 03:19:36.173881", "step": 658, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:36.206242", "step": 658, "epoch": 1 }, { "type": "loss", "content": 0.005909489002078772, "timestamp": "2025-09-15 03:19:36.208350", "step": 659, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:36.237937", "step": 659, "epoch": 1 }, { "type": "loss", "content": 0.011962154880166054, "timestamp": "2025-09-15 03:19:36.261456", "step": 660, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:36.292248", "step": 660, "epoch": 1 }, { "type": "loss", "content": 0.010646837763488293, "timestamp": "2025-09-15 03:19:36.294001", "step": 661, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:36.324433", "step": 661, "epoch": 1 }, { "type": "loss", "content": 0.004621988628059626, "timestamp": "2025-09-15 03:19:36.326545", "step": 662, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:36.356674", "step": 662, "epoch": 1 }, { "type": "loss", "content": 0.01572437211871147, "timestamp": "2025-09-15 03:19:36.359594", "step": 663, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:36.389546", "step": 663, "epoch": 1 }, { "type": "loss", "content": 0.01924065686762333, "timestamp": "2025-09-15 03:19:36.413356", "step": 664, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:36.443333", "step": 664, "epoch": 1 }, { "type": "loss", "content": 0.009026370011270046, "timestamp": "2025-09-15 03:19:36.445178", "step": 665, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:36.475161", "step": 665, "epoch": 1 }, { "type": "loss", "content": 0.006555170752108097, "timestamp": "2025-09-15 03:19:36.477114", "step": 666, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:36.508774", "step": 666, "epoch": 1 }, { "type": "loss", "content": 0.012694472447037697, "timestamp": "2025-09-15 03:19:36.510930", "step": 667, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:36.541487", "step": 667, "epoch": 1 }, { "type": "loss", "content": 0.012330038473010063, "timestamp": "2025-09-15 03:19:36.565046", "step": 668, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:36.595113", "step": 668, "epoch": 1 }, { "type": "loss", "content": 0.04121780022978783, "timestamp": "2025-09-15 03:19:36.597217", "step": 669, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:36.627506", "step": 669, "epoch": 1 }, { "type": "loss", "content": 0.05309586599469185, "timestamp": "2025-09-15 03:19:36.629586", "step": 670, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:36.659708", "step": 670, "epoch": 1 }, { "type": "loss", "content": 0.006368253845721483, "timestamp": "2025-09-15 03:19:36.662648", "step": 671, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:36.692272", "step": 671, "epoch": 1 }, { "type": "loss", "content": 0.019130660220980644, "timestamp": "2025-09-15 03:19:36.715694", "step": 672, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:36.746500", "step": 672, "epoch": 1 }, { "type": "loss", "content": 0.024251120164990425, "timestamp": "2025-09-15 03:19:36.748928", "step": 673, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:36.778732", "step": 673, "epoch": 1 }, { "type": "loss", "content": 0.025649379938840866, "timestamp": "2025-09-15 03:19:36.780936", "step": 674, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:36.810782", "step": 674, "epoch": 1 }, { "type": "loss", "content": 0.04780502989888191, "timestamp": "2025-09-15 03:19:36.812741", "step": 675, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:36.843220", "step": 675, "epoch": 1 }, { "type": "loss", "content": 0.012879802845418453, "timestamp": "2025-09-15 03:19:36.866538", "step": 676, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:36.896279", "step": 676, "epoch": 1 }, { "type": "loss", "content": 0.0165265966206789, "timestamp": "2025-09-15 03:19:36.898458", "step": 677, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:36.928417", "step": 677, "epoch": 1 }, { "type": "loss", "content": 0.009652511216700077, "timestamp": "2025-09-15 03:19:36.930824", "step": 678, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:36.961659", "step": 678, "epoch": 1 }, { "type": "loss", "content": 0.008517156355082989, "timestamp": "2025-09-15 03:19:36.963827", "step": 679, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:36.994853", "step": 679, "epoch": 1 }, { "type": "loss", "content": 0.012086872942745686, "timestamp": "2025-09-15 03:19:37.018261", "step": 680, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:37.047953", "step": 680, "epoch": 1 }, { "type": "loss", "content": 0.01267678290605545, "timestamp": "2025-09-15 03:19:37.050126", "step": 681, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:37.080491", "step": 681, "epoch": 1 }, { "type": "loss", "content": 0.014379492029547691, "timestamp": "2025-09-15 03:19:37.082330", "step": 682, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:37.111970", "step": 682, "epoch": 1 }, { "type": "loss", "content": 0.018130462616682053, "timestamp": "2025-09-15 03:19:37.113977", "step": 683, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:37.144365", "step": 683, "epoch": 1 }, { "type": "loss", "content": 0.009595629759132862, "timestamp": "2025-09-15 03:19:37.168484", "step": 684, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:19:37.880121", "step": 684, "epoch": 1 }, { "type": "pplx", "content": 110478225.79936634, "timestamp": "2025-09-15 03:19:37.882397", "step": 684, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:37.910669", "step": 684, "epoch": 1 }, { "type": "loss", "content": 0.03872322663664818, "timestamp": "2025-09-15 03:19:37.912785", "step": 685, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:37.942433", "step": 685, "epoch": 1 }, { "type": "loss", "content": 0.018750173971056938, "timestamp": "2025-09-15 03:19:37.944777", "step": 686, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:37.975193", "step": 686, "epoch": 1 }, { "type": "loss", "content": 0.0061281099915504456, "timestamp": "2025-09-15 03:19:37.978429", "step": 687, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:38.008224", "step": 687, "epoch": 1 }, { "type": "loss", "content": 0.006822456140071154, "timestamp": "2025-09-15 03:19:38.031688", "step": 688, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:38.061859", "step": 688, "epoch": 1 }, { "type": "loss", "content": 0.04175681620836258, "timestamp": "2025-09-15 03:19:38.063998", "step": 689, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:38.095345", "step": 689, "epoch": 1 }, { "type": "loss", "content": 0.00807065051048994, "timestamp": "2025-09-15 03:19:38.097520", "step": 690, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:38.129154", "step": 690, "epoch": 1 }, { "type": "loss", "content": 0.007711734157055616, "timestamp": "2025-09-15 03:19:38.131369", "step": 691, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:38.160717", "step": 691, "epoch": 1 }, { "type": "loss", "content": 0.00910022109746933, "timestamp": "2025-09-15 03:19:38.184220", "step": 692, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:38.214151", "step": 692, "epoch": 1 }, { "type": "loss", "content": 0.027539242058992386, "timestamp": "2025-09-15 03:19:38.216324", "step": 693, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:38.246949", "step": 693, "epoch": 1 }, { "type": "loss", "content": 0.047927696257829666, "timestamp": "2025-09-15 03:19:38.249186", "step": 694, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:38.278947", "step": 694, "epoch": 1 }, { "type": "loss", "content": 0.0026730988174676895, "timestamp": "2025-09-15 03:19:38.281425", "step": 695, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:38.311921", "step": 695, "epoch": 1 }, { "type": "loss", "content": 0.024083703756332397, "timestamp": "2025-09-15 03:19:38.335624", "step": 696, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:38.375662", "step": 696, "epoch": 1 }, { "type": "loss", "content": 0.008579043671488762, "timestamp": "2025-09-15 03:19:38.378574", "step": 697, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:38.409372", "step": 697, "epoch": 1 }, { "type": "loss", "content": 0.006062888540327549, "timestamp": "2025-09-15 03:19:38.412058", "step": 698, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:38.442720", "step": 698, "epoch": 1 }, { "type": "loss", "content": 0.023601248860359192, "timestamp": "2025-09-15 03:19:38.445222", "step": 699, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:38.476156", "step": 699, "epoch": 1 }, { "type": "loss", "content": 0.04036266356706619, "timestamp": "2025-09-15 03:19:38.499944", "step": 700, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:38.531441", "step": 700, "epoch": 1 }, { "type": "loss", "content": 0.00857632141560316, "timestamp": "2025-09-15 03:19:38.533626", "step": 701, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:38.564681", "step": 701, "epoch": 1 }, { "type": "loss", "content": 0.01594799943268299, "timestamp": "2025-09-15 03:19:38.566998", "step": 702, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:38.597747", "step": 702, "epoch": 1 }, { "type": "loss", "content": 0.0195697583258152, "timestamp": "2025-09-15 03:19:38.600011", "step": 703, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:38.630500", "step": 703, "epoch": 1 }, { "type": "loss", "content": 0.02024012990295887, "timestamp": "2025-09-15 03:19:38.654044", "step": 704, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:38.684706", "step": 704, "epoch": 1 }, { "type": "loss", "content": 0.022832201793789864, "timestamp": "2025-09-15 03:19:38.688201", "step": 705, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:38.718266", "step": 705, "epoch": 1 }, { "type": "loss", "content": 0.004162064287811518, "timestamp": "2025-09-15 03:19:38.720720", "step": 706, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:38.751034", "step": 706, "epoch": 1 }, { "type": "loss", "content": 0.0038514018524438143, "timestamp": "2025-09-15 03:19:38.753103", "step": 707, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:38.783498", "step": 707, "epoch": 1 }, { "type": "loss", "content": 0.01295088417828083, "timestamp": "2025-09-15 03:19:38.807224", "step": 708, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:38.838316", "step": 708, "epoch": 1 }, { "type": "loss", "content": 0.012839587405323982, "timestamp": "2025-09-15 03:19:38.840761", "step": 709, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:38.870848", "step": 709, "epoch": 1 }, { "type": "loss", "content": 0.04741484671831131, "timestamp": "2025-09-15 03:19:38.873113", "step": 710, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:19:38.903902", "step": 710, "epoch": 1 }, { "type": "loss", "content": 0.025619518011808395, "timestamp": "2025-09-15 03:19:38.906239", "step": 711, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:38.937428", "step": 711, "epoch": 1 }, { "type": "loss", "content": 0.03996429219841957, "timestamp": "2025-09-15 03:19:38.961599", "step": 712, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:38.991884", "step": 712, "epoch": 1 }, { "type": "loss", "content": 0.016623103991150856, "timestamp": "2025-09-15 03:19:38.994023", "step": 713, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:39.023874", "step": 713, "epoch": 1 }, { "type": "loss", "content": 0.010368691757321358, "timestamp": "2025-09-15 03:19:39.026689", "step": 714, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:39.056698", "step": 714, "epoch": 1 }, { "type": "loss", "content": 0.022392038255929947, "timestamp": "2025-09-15 03:19:39.058863", "step": 715, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:39.088619", "step": 715, "epoch": 1 }, { "type": "loss", "content": 0.03169969096779823, "timestamp": "2025-09-15 03:19:39.112653", "step": 716, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:39.143062", "step": 716, "epoch": 1 }, { "type": "loss", "content": 0.01695393957197666, "timestamp": "2025-09-15 03:19:39.145326", "step": 717, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:39.175095", "step": 717, "epoch": 1 }, { "type": "loss", "content": 0.020151188597083092, "timestamp": "2025-09-15 03:19:39.177433", "step": 718, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:39.208063", "step": 718, "epoch": 1 }, { "type": "loss", "content": 0.02938300557434559, "timestamp": "2025-09-15 03:19:39.210073", "step": 719, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:39.241620", "step": 719, "epoch": 1 }, { "type": "loss", "content": 0.03406095132231712, "timestamp": "2025-09-15 03:19:39.265302", "step": 720, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:39.295237", "step": 720, "epoch": 1 }, { "type": "loss", "content": 0.012514782138168812, "timestamp": "2025-09-15 03:19:39.297712", "step": 721, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:39.327634", "step": 721, "epoch": 1 }, { "type": "loss", "content": 0.013856396079063416, "timestamp": "2025-09-15 03:19:39.329995", "step": 722, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:39.360124", "step": 722, "epoch": 1 }, { "type": "loss", "content": 0.01550250593572855, "timestamp": "2025-09-15 03:19:39.362316", "step": 723, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:39.392400", "step": 723, "epoch": 1 }, { "type": "loss", "content": 0.02845914289355278, "timestamp": "2025-09-15 03:19:39.416151", "step": 724, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:39.446701", "step": 724, "epoch": 1 }, { "type": "loss", "content": 0.027011645957827568, "timestamp": "2025-09-15 03:19:39.449256", "step": 725, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:39.479549", "step": 725, "epoch": 1 }, { "type": "loss", "content": 0.022153319790959358, "timestamp": "2025-09-15 03:19:39.481807", "step": 726, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:39.512515", "step": 726, "epoch": 1 }, { "type": "loss", "content": 0.017665695399045944, "timestamp": "2025-09-15 03:19:39.514908", "step": 727, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:39.546049", "step": 727, "epoch": 1 }, { "type": "loss", "content": 0.015788642689585686, "timestamp": "2025-09-15 03:19:39.569513", "step": 728, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:39.599192", "step": 728, "epoch": 1 }, { "type": "loss", "content": 0.009126792661845684, "timestamp": "2025-09-15 03:19:39.601371", "step": 729, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:39.631684", "step": 729, "epoch": 1 }, { "type": "loss", "content": 0.02464197389781475, "timestamp": "2025-09-15 03:19:39.633942", "step": 730, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:39.663815", "step": 730, "epoch": 1 }, { "type": "loss", "content": 0.01288971770554781, "timestamp": "2025-09-15 03:19:39.665990", "step": 731, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:39.696417", "step": 731, "epoch": 1 }, { "type": "loss", "content": 0.023708462715148926, "timestamp": "2025-09-15 03:19:39.720257", "step": 732, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:39.751029", "step": 732, "epoch": 1 }, { "type": "loss", "content": 0.018586965277791023, "timestamp": "2025-09-15 03:19:39.753307", "step": 733, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:39.783625", "step": 733, "epoch": 1 }, { "type": "loss", "content": 0.015598964877426624, "timestamp": "2025-09-15 03:19:39.785775", "step": 734, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:39.816683", "step": 734, "epoch": 1 }, { "type": "loss", "content": 0.017110329121351242, "timestamp": "2025-09-15 03:19:39.819271", "step": 735, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:39.849995", "step": 735, "epoch": 1 }, { "type": "loss", "content": 0.013492013327777386, "timestamp": "2025-09-15 03:19:39.873602", "step": 736, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:39.903812", "step": 736, "epoch": 1 }, { "type": "loss", "content": 0.03329331800341606, "timestamp": "2025-09-15 03:19:39.905917", "step": 737, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:39.935792", "step": 737, "epoch": 1 }, { "type": "loss", "content": 0.009817318990826607, "timestamp": "2025-09-15 03:19:39.938021", "step": 738, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:39.968219", "step": 738, "epoch": 1 }, { "type": "loss", "content": 0.022194471210241318, "timestamp": "2025-09-15 03:19:39.970551", "step": 739, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:40.000627", "step": 739, "epoch": 1 }, { "type": "loss", "content": 0.031265296041965485, "timestamp": "2025-09-15 03:19:40.024156", "step": 740, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:40.054912", "step": 740, "epoch": 1 }, { "type": "loss", "content": 0.01729530841112137, "timestamp": "2025-09-15 03:19:40.057327", "step": 741, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:19:40.778644", "step": 741, "epoch": 1 }, { "type": "pplx", "content": 103717247.03669673, "timestamp": "2025-09-15 03:19:40.780588", "step": 741, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:40.808719", "step": 741, "epoch": 1 }, { "type": "loss", "content": 0.010400855913758278, "timestamp": "2025-09-15 03:19:40.810891", "step": 742, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:40.840610", "step": 742, "epoch": 1 }, { "type": "loss", "content": 0.015962064266204834, "timestamp": "2025-09-15 03:19:40.843064", "step": 743, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:40.873807", "step": 743, "epoch": 1 }, { "type": "loss", "content": 0.03265008702874184, "timestamp": "2025-09-15 03:19:40.897447", "step": 744, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:40.927413", "step": 744, "epoch": 1 }, { "type": "loss", "content": 0.02071802131831646, "timestamp": "2025-09-15 03:19:40.929460", "step": 745, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:40.960245", "step": 745, "epoch": 1 }, { "type": "loss", "content": 0.007692706771194935, "timestamp": "2025-09-15 03:19:40.962279", "step": 746, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:40.992394", "step": 746, "epoch": 1 }, { "type": "loss", "content": 0.007905025035142899, "timestamp": "2025-09-15 03:19:40.994567", "step": 747, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:41.024483", "step": 747, "epoch": 1 }, { "type": "loss", "content": 0.02939099445939064, "timestamp": "2025-09-15 03:19:41.047734", "step": 748, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:41.078688", "step": 748, "epoch": 1 }, { "type": "loss", "content": 0.01641055755317211, "timestamp": "2025-09-15 03:19:41.080807", "step": 749, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:41.111344", "step": 749, "epoch": 1 }, { "type": "loss", "content": 0.010218380019068718, "timestamp": "2025-09-15 03:19:41.113405", "step": 750, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:41.143853", "step": 750, "epoch": 1 }, { "type": "loss", "content": 0.02515035681426525, "timestamp": "2025-09-15 03:19:41.145967", "step": 751, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:41.175812", "step": 751, "epoch": 1 }, { "type": "loss", "content": 0.0318562313914299, "timestamp": "2025-09-15 03:19:41.199260", "step": 752, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:41.229048", "step": 752, "epoch": 1 }, { "type": "loss", "content": 0.017711641266942024, "timestamp": "2025-09-15 03:19:41.231205", "step": 753, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:41.261072", "step": 753, "epoch": 1 }, { "type": "loss", "content": 0.01047059241682291, "timestamp": "2025-09-15 03:19:41.264903", "step": 754, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:41.295027", "step": 754, "epoch": 1 }, { "type": "loss", "content": 0.013568080961704254, "timestamp": "2025-09-15 03:19:41.297049", "step": 755, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:41.327048", "step": 755, "epoch": 1 }, { "type": "loss", "content": 0.025309359654784203, "timestamp": "2025-09-15 03:19:41.350361", "step": 756, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:41.379873", "step": 756, "epoch": 1 }, { "type": "loss", "content": 0.007763172034174204, "timestamp": "2025-09-15 03:19:41.381794", "step": 757, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:41.412033", "step": 757, "epoch": 1 }, { "type": "loss", "content": 0.007784743793308735, "timestamp": "2025-09-15 03:19:41.414104", "step": 758, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:41.443993", "step": 758, "epoch": 1 }, { "type": "loss", "content": 0.011267587542533875, "timestamp": "2025-09-15 03:19:41.446425", "step": 759, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:41.476903", "step": 759, "epoch": 1 }, { "type": "loss", "content": 0.013863942585885525, "timestamp": "2025-09-15 03:19:41.500438", "step": 760, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:41.531170", "step": 760, "epoch": 1 }, { "type": "loss", "content": 0.006926204077899456, "timestamp": "2025-09-15 03:19:41.533280", "step": 761, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:19:41.563986", "step": 761, "epoch": 1 }, { "type": "loss", "content": 0.01246979646384716, "timestamp": "2025-09-15 03:19:41.566514", "step": 762, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:41.596790", "step": 762, "epoch": 1 }, { "type": "loss", "content": 0.029222693294286728, "timestamp": "2025-09-15 03:19:41.598803", "step": 763, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:41.629258", "step": 763, "epoch": 1 }, { "type": "loss", "content": 0.008983985520899296, "timestamp": "2025-09-15 03:19:41.652444", "step": 764, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:41.682490", "step": 764, "epoch": 1 }, { "type": "loss", "content": 0.010987072251737118, "timestamp": "2025-09-15 03:19:41.684608", "step": 765, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:41.714931", "step": 765, "epoch": 1 }, { "type": "loss", "content": 0.013884793035686016, "timestamp": "2025-09-15 03:19:41.716900", "step": 766, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:41.747942", "step": 766, "epoch": 1 }, { "type": "loss", "content": 0.007951878942549229, "timestamp": "2025-09-15 03:19:41.750087", "step": 767, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:41.782095", "step": 767, "epoch": 1 }, { "type": "loss", "content": 0.034430526196956635, "timestamp": "2025-09-15 03:19:41.805391", "step": 768, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:41.836765", "step": 768, "epoch": 1 }, { "type": "loss", "content": 0.01271742768585682, "timestamp": "2025-09-15 03:19:41.838870", "step": 769, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:41.869259", "step": 769, "epoch": 1 }, { "type": "loss", "content": 0.03543007746338844, "timestamp": "2025-09-15 03:19:41.871586", "step": 770, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:41.901840", "step": 770, "epoch": 1 }, { "type": "loss", "content": 0.021944111213088036, "timestamp": "2025-09-15 03:19:41.903746", "step": 771, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:41.934061", "step": 771, "epoch": 1 }, { "type": "loss", "content": 0.008409752510488033, "timestamp": "2025-09-15 03:19:41.957375", "step": 772, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:41.987312", "step": 772, "epoch": 1 }, { "type": "loss", "content": 0.04508158937096596, "timestamp": "2025-09-15 03:19:41.989251", "step": 773, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:42.019250", "step": 773, "epoch": 1 }, { "type": "loss", "content": 0.006456805858761072, "timestamp": "2025-09-15 03:19:42.021464", "step": 774, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:42.051437", "step": 774, "epoch": 1 }, { "type": "loss", "content": 0.028556156903505325, "timestamp": "2025-09-15 03:19:42.053388", "step": 775, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:42.083163", "step": 775, "epoch": 1 }, { "type": "loss", "content": 0.027395443990826607, "timestamp": "2025-09-15 03:19:42.106623", "step": 776, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:42.136557", "step": 776, "epoch": 1 }, { "type": "loss", "content": 0.006619160529226065, "timestamp": "2025-09-15 03:19:42.138621", "step": 777, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:42.168555", "step": 777, "epoch": 1 }, { "type": "loss", "content": 0.003444958943873644, "timestamp": "2025-09-15 03:19:42.170628", "step": 778, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:42.200353", "step": 778, "epoch": 1 }, { "type": "loss", "content": 0.02690565027296543, "timestamp": "2025-09-15 03:19:42.202531", "step": 779, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:42.232264", "step": 779, "epoch": 1 }, { "type": "loss", "content": 0.006759346928447485, "timestamp": "2025-09-15 03:19:42.255684", "step": 780, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:42.285862", "step": 780, "epoch": 1 }, { "type": "loss", "content": 0.02217439003288746, "timestamp": "2025-09-15 03:19:42.287802", "step": 781, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:42.318224", "step": 781, "epoch": 1 }, { "type": "loss", "content": 0.014988348819315434, "timestamp": "2025-09-15 03:19:42.320526", "step": 782, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:42.350429", "step": 782, "epoch": 1 }, { "type": "loss", "content": 0.026887197047472, "timestamp": "2025-09-15 03:19:42.352465", "step": 783, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:42.383306", "step": 783, "epoch": 1 }, { "type": "loss", "content": 0.016281643882393837, "timestamp": "2025-09-15 03:19:42.406709", "step": 784, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:42.437336", "step": 784, "epoch": 1 }, { "type": "loss", "content": 0.006518169771879911, "timestamp": "2025-09-15 03:19:42.439424", "step": 785, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:42.469707", "step": 785, "epoch": 1 }, { "type": "loss", "content": 0.003812138456851244, "timestamp": "2025-09-15 03:19:42.471716", "step": 786, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:42.501641", "step": 786, "epoch": 1 }, { "type": "loss", "content": 0.004710691515356302, "timestamp": "2025-09-15 03:19:42.503901", "step": 787, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:42.534023", "step": 787, "epoch": 1 }, { "type": "loss", "content": 0.026245180517435074, "timestamp": "2025-09-15 03:19:42.558608", "step": 788, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:42.588519", "step": 788, "epoch": 1 }, { "type": "loss", "content": 0.011165949515998363, "timestamp": "2025-09-15 03:19:42.590814", "step": 789, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:42.620824", "step": 789, "epoch": 1 }, { "type": "loss", "content": 0.003821011632680893, "timestamp": "2025-09-15 03:19:42.623018", "step": 790, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:42.653068", "step": 790, "epoch": 1 }, { "type": "loss", "content": 0.017458414658904076, "timestamp": "2025-09-15 03:19:42.655132", "step": 791, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:42.685327", "step": 791, "epoch": 1 }, { "type": "loss", "content": 0.02229154482483864, "timestamp": "2025-09-15 03:19:42.708863", "step": 792, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:42.738929", "step": 792, "epoch": 1 }, { "type": "loss", "content": 0.009810912422835827, "timestamp": "2025-09-15 03:19:42.741257", "step": 793, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:42.770555", "step": 793, "epoch": 1 }, { "type": "loss", "content": 0.04138774797320366, "timestamp": "2025-09-15 03:19:42.772514", "step": 794, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:42.801904", "step": 794, "epoch": 1 }, { "type": "loss", "content": 0.014048370532691479, "timestamp": "2025-09-15 03:19:42.803777", "step": 795, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:42.833875", "step": 795, "epoch": 1 }, { "type": "loss", "content": 0.0071984389796853065, "timestamp": "2025-09-15 03:19:42.857493", "step": 796, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:42.887716", "step": 796, "epoch": 1 }, { "type": "loss", "content": 0.008396588265895844, "timestamp": "2025-09-15 03:19:42.889942", "step": 797, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:42.920444", "step": 797, "epoch": 1 }, { "type": "loss", "content": 0.014851844869554043, "timestamp": "2025-09-15 03:19:42.922534", "step": 798, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:19:43.629519", "step": 798, "epoch": 1 }, { "type": "pplx", "content": 104045215.20344174, "timestamp": "2025-09-15 03:19:43.631597", "step": 798, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:43.660045", "step": 798, "epoch": 1 }, { "type": "loss", "content": 0.005125641357153654, "timestamp": "2025-09-15 03:19:43.662088", "step": 799, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:43.691708", "step": 799, "epoch": 1 }, { "type": "loss", "content": 0.03359196335077286, "timestamp": "2025-09-15 03:19:43.715256", "step": 800, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:43.745226", "step": 800, "epoch": 1 }, { "type": "loss", "content": 0.024786178022623062, "timestamp": "2025-09-15 03:19:43.747145", "step": 801, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:43.777091", "step": 801, "epoch": 1 }, { "type": "loss", "content": 0.030102457851171494, "timestamp": "2025-09-15 03:19:43.779278", "step": 802, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:43.808965", "step": 802, "epoch": 1 }, { "type": "loss", "content": 0.024250861257314682, "timestamp": "2025-09-15 03:19:43.811257", "step": 803, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:43.841172", "step": 803, "epoch": 1 }, { "type": "loss", "content": 0.008365518413484097, "timestamp": "2025-09-15 03:19:43.864521", "step": 804, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:43.894479", "step": 804, "epoch": 1 }, { "type": "loss", "content": 0.024482980370521545, "timestamp": "2025-09-15 03:19:43.896537", "step": 805, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:43.926254", "step": 805, "epoch": 1 }, { "type": "loss", "content": 0.026492753997445107, "timestamp": "2025-09-15 03:19:43.928646", "step": 806, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:43.958387", "step": 806, "epoch": 1 }, { "type": "loss", "content": 0.01094807032495737, "timestamp": "2025-09-15 03:19:43.960359", "step": 807, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:43.990492", "step": 807, "epoch": 1 }, { "type": "loss", "content": 0.01790359988808632, "timestamp": "2025-09-15 03:19:44.013933", "step": 808, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:44.044146", "step": 808, "epoch": 1 }, { "type": "loss", "content": 0.00928981602191925, "timestamp": "2025-09-15 03:19:44.046286", "step": 809, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:44.076052", "step": 809, "epoch": 1 }, { "type": "loss", "content": 0.01696016825735569, "timestamp": "2025-09-15 03:19:44.078318", "step": 810, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:44.108250", "step": 810, "epoch": 1 }, { "type": "loss", "content": 0.020789949223399162, "timestamp": "2025-09-15 03:19:44.110252", "step": 811, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:44.140116", "step": 811, "epoch": 1 }, { "type": "loss", "content": 0.012049756944179535, "timestamp": "2025-09-15 03:19:44.163467", "step": 812, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:44.193188", "step": 812, "epoch": 1 }, { "type": "loss", "content": 0.02744273841381073, "timestamp": "2025-09-15 03:19:44.195207", "step": 813, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:44.224712", "step": 813, "epoch": 1 }, { "type": "loss", "content": 0.014353885315358639, "timestamp": "2025-09-15 03:19:44.226742", "step": 814, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:44.256987", "step": 814, "epoch": 1 }, { "type": "loss", "content": 0.008923429064452648, "timestamp": "2025-09-15 03:19:44.259241", "step": 815, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:44.288686", "step": 815, "epoch": 1 }, { "type": "loss", "content": 0.009295600466430187, "timestamp": "2025-09-15 03:19:44.312556", "step": 816, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:44.342475", "step": 816, "epoch": 1 }, { "type": "loss", "content": 0.025669749826192856, "timestamp": "2025-09-15 03:19:44.344563", "step": 817, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:44.374499", "step": 817, "epoch": 1 }, { "type": "loss", "content": 0.013954401016235352, "timestamp": "2025-09-15 03:19:44.376539", "step": 818, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:44.406901", "step": 818, "epoch": 1 }, { "type": "loss", "content": 0.007388160564005375, "timestamp": "2025-09-15 03:19:44.408833", "step": 819, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:44.439325", "step": 819, "epoch": 1 }, { "type": "loss", "content": 0.014094533398747444, "timestamp": "2025-09-15 03:19:44.462711", "step": 820, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:44.492759", "step": 820, "epoch": 1 }, { "type": "loss", "content": 0.012995717115700245, "timestamp": "2025-09-15 03:19:44.494912", "step": 821, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:44.524843", "step": 821, "epoch": 1 }, { "type": "loss", "content": 0.01578564941883087, "timestamp": "2025-09-15 03:19:44.526939", "step": 822, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:44.557018", "step": 822, "epoch": 1 }, { "type": "loss", "content": 0.012877593748271465, "timestamp": "2025-09-15 03:19:44.558916", "step": 823, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:44.590517", "step": 823, "epoch": 1 }, { "type": "loss", "content": 0.02812553383409977, "timestamp": "2025-09-15 03:19:44.614026", "step": 824, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:44.643995", "step": 824, "epoch": 1 }, { "type": "loss", "content": 0.009813624434173107, "timestamp": "2025-09-15 03:19:44.646241", "step": 825, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:44.675963", "step": 825, "epoch": 1 }, { "type": "loss", "content": 0.01687939278781414, "timestamp": "2025-09-15 03:19:44.678068", "step": 826, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:44.707979", "step": 826, "epoch": 1 }, { "type": "loss", "content": 0.02816028706729412, "timestamp": "2025-09-15 03:19:44.710193", "step": 827, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:44.740965", "step": 827, "epoch": 1 }, { "type": "loss", "content": 0.011854671873152256, "timestamp": "2025-09-15 03:19:44.764447", "step": 828, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:44.795127", "step": 828, "epoch": 1 }, { "type": "loss", "content": 0.014840801246464252, "timestamp": "2025-09-15 03:19:44.797196", "step": 829, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:44.827052", "step": 829, "epoch": 1 }, { "type": "loss", "content": 0.004077043384313583, "timestamp": "2025-09-15 03:19:44.829125", "step": 830, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:44.858933", "step": 830, "epoch": 1 }, { "type": "loss", "content": 0.005604589823633432, "timestamp": "2025-09-15 03:19:44.860942", "step": 831, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:44.890927", "step": 831, "epoch": 1 }, { "type": "loss", "content": 0.004893143195658922, "timestamp": "2025-09-15 03:19:44.914522", "step": 832, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:44.944679", "step": 832, "epoch": 1 }, { "type": "loss", "content": 0.03003629483282566, "timestamp": "2025-09-15 03:19:44.946780", "step": 833, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:44.977071", "step": 833, "epoch": 1 }, { "type": "loss", "content": 0.01299560721963644, "timestamp": "2025-09-15 03:19:44.979326", "step": 834, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:45.009667", "step": 834, "epoch": 1 }, { "type": "loss", "content": 0.0031098301988095045, "timestamp": "2025-09-15 03:19:45.011653", "step": 835, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:45.042133", "step": 835, "epoch": 1 }, { "type": "loss", "content": 0.006905579008162022, "timestamp": "2025-09-15 03:19:45.065417", "step": 836, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:45.097673", "step": 836, "epoch": 1 }, { "type": "loss", "content": 0.040472351014614105, "timestamp": "2025-09-15 03:19:45.100169", "step": 837, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:45.130115", "step": 837, "epoch": 1 }, { "type": "loss", "content": 0.02735285460948944, "timestamp": "2025-09-15 03:19:45.132218", "step": 838, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:45.162417", "step": 838, "epoch": 1 }, { "type": "loss", "content": 0.03405022248625755, "timestamp": "2025-09-15 03:19:45.164413", "step": 839, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:45.193969", "step": 839, "epoch": 1 }, { "type": "loss", "content": 0.01648932322859764, "timestamp": "2025-09-15 03:19:45.217348", "step": 840, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:45.247358", "step": 840, "epoch": 1 }, { "type": "loss", "content": 0.018406063318252563, "timestamp": "2025-09-15 03:19:45.249478", "step": 841, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:45.279399", "step": 841, "epoch": 1 }, { "type": "loss", "content": 0.0345698781311512, "timestamp": "2025-09-15 03:19:45.281422", "step": 842, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:45.310699", "step": 842, "epoch": 1 }, { "type": "loss", "content": 0.015852032229304314, "timestamp": "2025-09-15 03:19:45.313655", "step": 843, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:45.344231", "step": 843, "epoch": 1 }, { "type": "loss", "content": 0.01953796111047268, "timestamp": "2025-09-15 03:19:45.368112", "step": 844, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:45.398869", "step": 844, "epoch": 1 }, { "type": "loss", "content": 0.005201183725148439, "timestamp": "2025-09-15 03:19:45.401262", "step": 845, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:45.431133", "step": 845, "epoch": 1 }, { "type": "loss", "content": 0.0030453321523964405, "timestamp": "2025-09-15 03:19:45.433331", "step": 846, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:45.463266", "step": 846, "epoch": 1 }, { "type": "loss", "content": 0.025405941531062126, "timestamp": "2025-09-15 03:19:45.465257", "step": 847, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:45.494939", "step": 847, "epoch": 1 }, { "type": "loss", "content": 0.02325121872127056, "timestamp": "2025-09-15 03:19:45.518341", "step": 848, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:45.548577", "step": 848, "epoch": 1 }, { "type": "loss", "content": 0.0039110551588237286, "timestamp": "2025-09-15 03:19:45.550569", "step": 849, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:45.580032", "step": 849, "epoch": 1 }, { "type": "loss", "content": 0.0093283262103796, "timestamp": "2025-09-15 03:19:45.582138", "step": 850, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:45.612011", "step": 850, "epoch": 1 }, { "type": "loss", "content": 0.007319420110434294, "timestamp": "2025-09-15 03:19:45.613961", "step": 851, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:45.644654", "step": 851, "epoch": 1 }, { "type": "loss", "content": 0.007478214800357819, "timestamp": "2025-09-15 03:19:45.668145", "step": 852, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:45.697634", "step": 852, "epoch": 1 }, { "type": "loss", "content": 0.007805486675351858, "timestamp": "2025-09-15 03:19:45.699649", "step": 853, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:45.729480", "step": 853, "epoch": 1 }, { "type": "loss", "content": 0.035240091383457184, "timestamp": "2025-09-15 03:19:45.731466", "step": 854, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:45.761510", "step": 854, "epoch": 1 }, { "type": "loss", "content": 0.010297401808202267, "timestamp": "2025-09-15 03:19:45.763625", "step": 855, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:19:46.472603", "step": 855, "epoch": 1 }, { "type": "pplx", "content": 100037921.68841042, "timestamp": "2025-09-15 03:19:46.474452", "step": 855, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:46.502434", "step": 855, "epoch": 1 }, { "type": "loss", "content": 0.00798899494111538, "timestamp": "2025-09-15 03:19:46.525899", "step": 856, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:46.556083", "step": 856, "epoch": 1 }, { "type": "loss", "content": 0.012356921099126339, "timestamp": "2025-09-15 03:19:46.558117", "step": 857, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:46.588087", "step": 857, "epoch": 1 }, { "type": "loss", "content": 0.009803662076592445, "timestamp": "2025-09-15 03:19:46.590191", "step": 858, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:46.620069", "step": 858, "epoch": 1 }, { "type": "loss", "content": 0.011618572287261486, "timestamp": "2025-09-15 03:19:46.622059", "step": 859, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:46.651554", "step": 859, "epoch": 1 }, { "type": "loss", "content": 0.03089841827750206, "timestamp": "2025-09-15 03:19:46.675023", "step": 860, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:46.704709", "step": 860, "epoch": 1 }, { "type": "loss", "content": 0.014122366905212402, "timestamp": "2025-09-15 03:19:46.706682", "step": 861, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:46.737170", "step": 861, "epoch": 1 }, { "type": "loss", "content": 0.006204747129231691, "timestamp": "2025-09-15 03:19:46.739307", "step": 862, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:46.769294", "step": 862, "epoch": 1 }, { "type": "loss", "content": 0.011782601475715637, "timestamp": "2025-09-15 03:19:46.771364", "step": 863, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:46.801556", "step": 863, "epoch": 1 }, { "type": "loss", "content": 0.010657857172191143, "timestamp": "2025-09-15 03:19:46.825091", "step": 864, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:46.854926", "step": 864, "epoch": 1 }, { "type": "loss", "content": 0.012917198240756989, "timestamp": "2025-09-15 03:19:46.856991", "step": 865, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:46.886751", "step": 865, "epoch": 1 }, { "type": "loss", "content": 0.002522421535104513, "timestamp": "2025-09-15 03:19:46.888801", "step": 866, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:46.919956", "step": 866, "epoch": 1 }, { "type": "loss", "content": 0.002517345128580928, "timestamp": "2025-09-15 03:19:46.922000", "step": 867, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:46.952565", "step": 867, "epoch": 1 }, { "type": "loss", "content": 0.004488280508667231, "timestamp": "2025-09-15 03:19:46.975920", "step": 868, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:47.006076", "step": 868, "epoch": 1 }, { "type": "loss", "content": 0.04148333519697189, "timestamp": "2025-09-15 03:19:47.008298", "step": 869, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:47.038485", "step": 869, "epoch": 1 }, { "type": "loss", "content": 0.07123760879039764, "timestamp": "2025-09-15 03:19:47.040295", "step": 870, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:47.070705", "step": 870, "epoch": 1 }, { "type": "loss", "content": 0.016277441754937172, "timestamp": "2025-09-15 03:19:47.072732", "step": 871, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:47.103374", "step": 871, "epoch": 1 }, { "type": "loss", "content": 0.00033685853122733533, "timestamp": "2025-09-15 03:19:47.126806", "step": 872, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:47.157549", "step": 872, "epoch": 1 }, { "type": "loss", "content": 0.029803471639752388, "timestamp": "2025-09-15 03:19:47.159632", "step": 873, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:47.189237", "step": 873, "epoch": 1 }, { "type": "loss", "content": 0.020926347002387047, "timestamp": "2025-09-15 03:19:47.191261", "step": 874, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:47.220936", "step": 874, "epoch": 1 }, { "type": "loss", "content": 0.03717362508177757, "timestamp": "2025-09-15 03:19:47.222952", "step": 875, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:47.253178", "step": 875, "epoch": 1 }, { "type": "loss", "content": 0.05333937332034111, "timestamp": "2025-09-15 03:19:47.276570", "step": 876, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:47.307079", "step": 876, "epoch": 1 }, { "type": "loss", "content": 0.0026524479035288095, "timestamp": "2025-09-15 03:19:47.308993", "step": 877, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:47.338628", "step": 877, "epoch": 1 }, { "type": "loss", "content": 0.0010080545907840133, "timestamp": "2025-09-15 03:19:47.340905", "step": 878, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:47.371024", "step": 878, "epoch": 1 }, { "type": "loss", "content": 0.046601030975580215, "timestamp": "2025-09-15 03:19:47.373104", "step": 879, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:47.403307", "step": 879, "epoch": 1 }, { "type": "loss", "content": 0.058754634112119675, "timestamp": "2025-09-15 03:19:47.426954", "step": 880, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:47.457145", "step": 880, "epoch": 1 }, { "type": "loss", "content": 0.017497990280389786, "timestamp": "2025-09-15 03:19:47.459099", "step": 881, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:47.488796", "step": 881, "epoch": 1 }, { "type": "loss", "content": 0.029213156551122665, "timestamp": "2025-09-15 03:19:47.490708", "step": 882, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:47.520081", "step": 882, "epoch": 1 }, { "type": "loss", "content": 0.019578823819756508, "timestamp": "2025-09-15 03:19:47.522043", "step": 883, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:47.551862", "step": 883, "epoch": 1 }, { "type": "loss", "content": 0.007458627223968506, "timestamp": "2025-09-15 03:19:47.575402", "step": 884, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:47.605497", "step": 884, "epoch": 1 }, { "type": "loss", "content": 0.03434719145298004, "timestamp": "2025-09-15 03:19:47.607573", "step": 885, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:47.637221", "step": 885, "epoch": 1 }, { "type": "loss", "content": 0.018169419839978218, "timestamp": "2025-09-15 03:19:47.639256", "step": 886, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:47.669823", "step": 886, "epoch": 1 }, { "type": "loss", "content": 0.04940139502286911, "timestamp": "2025-09-15 03:19:47.671951", "step": 887, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:47.701865", "step": 887, "epoch": 1 }, { "type": "loss", "content": 0.05026103928685188, "timestamp": "2025-09-15 03:19:47.725309", "step": 888, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:47.755323", "step": 888, "epoch": 1 }, { "type": "loss", "content": 0.026359498500823975, "timestamp": "2025-09-15 03:19:47.757275", "step": 889, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:47.787431", "step": 889, "epoch": 1 }, { "type": "loss", "content": 0.0597798153758049, "timestamp": "2025-09-15 03:19:47.789332", "step": 890, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:47.818782", "step": 890, "epoch": 1 }, { "type": "loss", "content": 0.012453628703951836, "timestamp": "2025-09-15 03:19:47.820736", "step": 891, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:47.851572", "step": 891, "epoch": 1 }, { "type": "loss", "content": 0.02922959066927433, "timestamp": "2025-09-15 03:19:47.875077", "step": 892, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:47.905722", "step": 892, "epoch": 1 }, { "type": "loss", "content": 0.019007423892617226, "timestamp": "2025-09-15 03:19:47.907743", "step": 893, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:47.937952", "step": 893, "epoch": 1 }, { "type": "loss", "content": 0.05699565261602402, "timestamp": "2025-09-15 03:19:47.940014", "step": 894, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:47.970485", "step": 894, "epoch": 1 }, { "type": "loss", "content": 0.01930629089474678, "timestamp": "2025-09-15 03:19:47.972661", "step": 895, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:48.002718", "step": 895, "epoch": 1 }, { "type": "loss", "content": 0.02485084906220436, "timestamp": "2025-09-15 03:19:48.027890", "step": 896, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:48.057767", "step": 896, "epoch": 1 }, { "type": "loss", "content": 0.02043440006673336, "timestamp": "2025-09-15 03:19:48.059724", "step": 897, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:48.089632", "step": 897, "epoch": 1 }, { "type": "loss", "content": 0.018073951825499535, "timestamp": "2025-09-15 03:19:48.092145", "step": 898, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:48.123315", "step": 898, "epoch": 1 }, { "type": "loss", "content": 0.017345238476991653, "timestamp": "2025-09-15 03:19:48.125443", "step": 899, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:48.155262", "step": 899, "epoch": 1 }, { "type": "loss", "content": 0.01569359377026558, "timestamp": "2025-09-15 03:19:48.178793", "step": 900, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:48.208871", "step": 900, "epoch": 1 }, { "type": "loss", "content": 0.014010734856128693, "timestamp": "2025-09-15 03:19:48.210957", "step": 901, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:48.241119", "step": 901, "epoch": 1 }, { "type": "loss", "content": 0.008070044219493866, "timestamp": "2025-09-15 03:19:48.243230", "step": 902, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:48.273844", "step": 902, "epoch": 1 }, { "type": "loss", "content": 0.019519077613949776, "timestamp": "2025-09-15 03:19:48.275881", "step": 903, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:48.305848", "step": 903, "epoch": 1 }, { "type": "loss", "content": 0.025401020422577858, "timestamp": "2025-09-15 03:19:48.329904", "step": 904, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:48.360223", "step": 904, "epoch": 1 }, { "type": "loss", "content": 0.0037331220228224993, "timestamp": "2025-09-15 03:19:48.362199", "step": 905, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:48.391499", "step": 905, "epoch": 1 }, { "type": "loss", "content": 0.010357382707297802, "timestamp": "2025-09-15 03:19:48.393491", "step": 906, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:48.423588", "step": 906, "epoch": 1 }, { "type": "loss", "content": 0.035081468522548676, "timestamp": "2025-09-15 03:19:48.425849", "step": 907, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:48.456323", "step": 907, "epoch": 1 }, { "type": "loss", "content": 0.023906197398900986, "timestamp": "2025-09-15 03:19:48.479865", "step": 908, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:48.509964", "step": 908, "epoch": 1 }, { "type": "loss", "content": 0.033607613295316696, "timestamp": "2025-09-15 03:19:48.512927", "step": 909, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:48.542927", "step": 909, "epoch": 1 }, { "type": "loss", "content": 0.0026440354995429516, "timestamp": "2025-09-15 03:19:48.544963", "step": 910, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:48.574828", "step": 910, "epoch": 1 }, { "type": "loss", "content": 0.0184736680239439, "timestamp": "2025-09-15 03:19:48.577093", "step": 911, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:48.606736", "step": 911, "epoch": 1 }, { "type": "loss", "content": 0.0037520730402320623, "timestamp": "2025-09-15 03:19:48.630137", "step": 912, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:19:49.340710", "step": 912, "epoch": 1 }, { "type": "pplx", "content": 72598961.44010709, "timestamp": "2025-09-15 03:19:49.342492", "step": 912, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:49.371687", "step": 912, "epoch": 1 }, { "type": "loss", "content": 0.03960108011960983, "timestamp": "2025-09-15 03:19:49.374066", "step": 913, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:49.404380", "step": 913, "epoch": 1 }, { "type": "loss", "content": 0.017269397154450417, "timestamp": "2025-09-15 03:19:49.406342", "step": 914, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:49.436269", "step": 914, "epoch": 1 }, { "type": "loss", "content": 0.0040565552189946175, "timestamp": "2025-09-15 03:19:49.437955", "step": 915, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:49.468065", "step": 915, "epoch": 1 }, { "type": "loss", "content": 0.003966494929045439, "timestamp": "2025-09-15 03:19:49.491431", "step": 916, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:49.521051", "step": 916, "epoch": 1 }, { "type": "loss", "content": 0.034861061722040176, "timestamp": "2025-09-15 03:19:49.522784", "step": 917, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:49.571945", "step": 917, "epoch": 2 }, { "type": "loss", "content": 0.039139218628406525, "timestamp": "2025-09-15 03:19:49.573899", "step": 918, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:49.603605", "step": 918, "epoch": 2 }, { "type": "loss", "content": 0.017292311415076256, "timestamp": "2025-09-15 03:19:49.605430", "step": 919, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:49.634924", "step": 919, "epoch": 2 }, { "type": "loss", "content": 0.05987069010734558, "timestamp": "2025-09-15 03:19:49.658535", "step": 920, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:49.688090", "step": 920, "epoch": 2 }, { "type": "loss", "content": 0.024779552593827248, "timestamp": "2025-09-15 03:19:49.690092", "step": 921, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:49.720099", "step": 921, "epoch": 2 }, { "type": "loss", "content": 0.03946017473936081, "timestamp": "2025-09-15 03:19:49.722247", "step": 922, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:49.752125", "step": 922, "epoch": 2 }, { "type": "loss", "content": 0.01998963952064514, "timestamp": "2025-09-15 03:19:49.753990", "step": 923, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:49.783600", "step": 923, "epoch": 2 }, { "type": "loss", "content": 0.030949410051107407, "timestamp": "2025-09-15 03:19:49.806874", "step": 924, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:49.837311", "step": 924, "epoch": 2 }, { "type": "loss", "content": 0.017210932448506355, "timestamp": "2025-09-15 03:19:49.839370", "step": 925, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:49.869458", "step": 925, "epoch": 2 }, { "type": "loss", "content": 0.011270995251834393, "timestamp": "2025-09-15 03:19:49.871550", "step": 926, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:49.901089", "step": 926, "epoch": 2 }, { "type": "loss", "content": 0.025099992752075195, "timestamp": "2025-09-15 03:19:49.903307", "step": 927, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:49.933067", "step": 927, "epoch": 2 }, { "type": "loss", "content": 0.020727096125483513, "timestamp": "2025-09-15 03:19:49.956605", "step": 928, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:49.986087", "step": 928, "epoch": 2 }, { "type": "loss", "content": 0.01741086132824421, "timestamp": "2025-09-15 03:19:49.988451", "step": 929, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:50.017823", "step": 929, "epoch": 2 }, { "type": "loss", "content": 0.019103452563285828, "timestamp": "2025-09-15 03:19:50.020315", "step": 930, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:50.049761", "step": 930, "epoch": 2 }, { "type": "loss", "content": 0.019196193665266037, "timestamp": "2025-09-15 03:19:50.051441", "step": 931, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:50.081837", "step": 931, "epoch": 2 }, { "type": "loss", "content": 0.023661328479647636, "timestamp": "2025-09-15 03:19:50.105008", "step": 932, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:50.134777", "step": 932, "epoch": 2 }, { "type": "loss", "content": 0.024797087535262108, "timestamp": "2025-09-15 03:19:50.137051", "step": 933, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:50.167971", "step": 933, "epoch": 2 }, { "type": "loss", "content": 0.025742126628756523, "timestamp": "2025-09-15 03:19:50.169909", "step": 934, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:50.199762", "step": 934, "epoch": 2 }, { "type": "loss", "content": 0.017478061839938164, "timestamp": "2025-09-15 03:19:50.202235", "step": 935, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:50.232848", "step": 935, "epoch": 2 }, { "type": "loss", "content": 0.023372257128357887, "timestamp": "2025-09-15 03:19:50.256493", "step": 936, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:50.286609", "step": 936, "epoch": 2 }, { "type": "loss", "content": 0.01836288534104824, "timestamp": "2025-09-15 03:19:50.288514", "step": 937, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:50.318671", "step": 937, "epoch": 2 }, { "type": "loss", "content": 0.02380347065627575, "timestamp": "2025-09-15 03:19:50.320765", "step": 938, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:50.351214", "step": 938, "epoch": 2 }, { "type": "loss", "content": 0.025880884379148483, "timestamp": "2025-09-15 03:19:50.353295", "step": 939, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:50.383964", "step": 939, "epoch": 2 }, { "type": "loss", "content": 0.018349649384617805, "timestamp": "2025-09-15 03:19:50.407628", "step": 940, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:50.437113", "step": 940, "epoch": 2 }, { "type": "loss", "content": 0.026017997413873672, "timestamp": "2025-09-15 03:19:50.438917", "step": 941, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:50.469082", "step": 941, "epoch": 2 }, { "type": "loss", "content": 0.01880783401429653, "timestamp": "2025-09-15 03:19:50.470919", "step": 942, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:50.501096", "step": 942, "epoch": 2 }, { "type": "loss", "content": 0.015078485012054443, "timestamp": "2025-09-15 03:19:50.503273", "step": 943, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:50.532688", "step": 943, "epoch": 2 }, { "type": "loss", "content": 0.017336489632725716, "timestamp": "2025-09-15 03:19:50.555910", "step": 944, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:50.585760", "step": 944, "epoch": 2 }, { "type": "loss", "content": 0.02135271206498146, "timestamp": "2025-09-15 03:19:50.587553", "step": 945, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:50.617823", "step": 945, "epoch": 2 }, { "type": "loss", "content": 0.018634533509612083, "timestamp": "2025-09-15 03:19:50.619666", "step": 946, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:50.649299", "step": 946, "epoch": 2 }, { "type": "loss", "content": 0.01942361891269684, "timestamp": "2025-09-15 03:19:50.651036", "step": 947, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:50.680156", "step": 947, "epoch": 2 }, { "type": "loss", "content": 0.02213677205145359, "timestamp": "2025-09-15 03:19:50.703548", "step": 948, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:50.733476", "step": 948, "epoch": 2 }, { "type": "loss", "content": 0.017390018329024315, "timestamp": "2025-09-15 03:19:50.735496", "step": 949, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:50.765761", "step": 949, "epoch": 2 }, { "type": "loss", "content": 0.008217746391892433, "timestamp": "2025-09-15 03:19:50.768128", "step": 950, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:50.797893", "step": 950, "epoch": 2 }, { "type": "loss", "content": 0.032679129391908646, "timestamp": "2025-09-15 03:19:50.799998", "step": 951, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:50.829730", "step": 951, "epoch": 2 }, { "type": "loss", "content": 0.00899417418986559, "timestamp": "2025-09-15 03:19:50.853197", "step": 952, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:50.882525", "step": 952, "epoch": 2 }, { "type": "loss", "content": 0.01678101159632206, "timestamp": "2025-09-15 03:19:50.884330", "step": 953, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:50.913859", "step": 953, "epoch": 2 }, { "type": "loss", "content": 0.01344042457640171, "timestamp": "2025-09-15 03:19:50.915537", "step": 954, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:50.945118", "step": 954, "epoch": 2 }, { "type": "loss", "content": 0.013032278046011925, "timestamp": "2025-09-15 03:19:50.946942", "step": 955, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:50.976642", "step": 955, "epoch": 2 }, { "type": "loss", "content": 0.007103900425136089, "timestamp": "2025-09-15 03:19:50.999813", "step": 956, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:51.029536", "step": 956, "epoch": 2 }, { "type": "loss", "content": 0.01115063764154911, "timestamp": "2025-09-15 03:19:51.031414", "step": 957, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:51.060862", "step": 957, "epoch": 2 }, { "type": "loss", "content": 0.0056912945583462715, "timestamp": "2025-09-15 03:19:51.062490", "step": 958, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:51.092578", "step": 958, "epoch": 2 }, { "type": "loss", "content": 0.018493881449103355, "timestamp": "2025-09-15 03:19:51.094650", "step": 959, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:51.124662", "step": 959, "epoch": 2 }, { "type": "loss", "content": 0.021584318950772285, "timestamp": "2025-09-15 03:19:51.148141", "step": 960, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:51.178184", "step": 960, "epoch": 2 }, { "type": "loss", "content": 0.0019218011293560266, "timestamp": "2025-09-15 03:19:51.180035", "step": 961, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:51.210201", "step": 961, "epoch": 2 }, { "type": "loss", "content": 0.003553554881364107, "timestamp": "2025-09-15 03:19:51.212278", "step": 962, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:51.244332", "step": 962, "epoch": 2 }, { "type": "loss", "content": 0.003442298388108611, "timestamp": "2025-09-15 03:19:51.247387", "step": 963, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:51.277085", "step": 963, "epoch": 2 }, { "type": "loss", "content": 0.022211765870451927, "timestamp": "2025-09-15 03:19:51.300598", "step": 964, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:51.330267", "step": 964, "epoch": 2 }, { "type": "loss", "content": 0.009360029362142086, "timestamp": "2025-09-15 03:19:51.332114", "step": 965, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:51.361890", "step": 965, "epoch": 2 }, { "type": "loss", "content": 0.001736950478516519, "timestamp": "2025-09-15 03:19:51.363682", "step": 966, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:51.393849", "step": 966, "epoch": 2 }, { "type": "loss", "content": 0.0288834385573864, "timestamp": "2025-09-15 03:19:51.395951", "step": 967, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:51.425707", "step": 967, "epoch": 2 }, { "type": "loss", "content": 0.0033125595655292273, "timestamp": "2025-09-15 03:19:51.448812", "step": 968, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:51.477964", "step": 968, "epoch": 2 }, { "type": "loss", "content": 0.0421050563454628, "timestamp": "2025-09-15 03:19:51.479673", "step": 969, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:19:52.188633", "step": 969, "epoch": 2 }, { "type": "pplx", "content": 63605872.170415215, "timestamp": "2025-09-15 03:19:52.190783", "step": 969, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:52.219146", "step": 969, "epoch": 2 }, { "type": "loss", "content": 0.03343040868639946, "timestamp": "2025-09-15 03:19:52.221077", "step": 970, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:52.251560", "step": 970, "epoch": 2 }, { "type": "loss", "content": 0.03231266885995865, "timestamp": "2025-09-15 03:19:52.253617", "step": 971, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:52.283909", "step": 971, "epoch": 2 }, { "type": "loss", "content": 0.01777302660048008, "timestamp": "2025-09-15 03:19:52.307261", "step": 972, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:52.337031", "step": 972, "epoch": 2 }, { "type": "loss", "content": 0.027679353952407837, "timestamp": "2025-09-15 03:19:52.338850", "step": 973, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:52.368361", "step": 973, "epoch": 2 }, { "type": "loss", "content": 0.016544125974178314, "timestamp": "2025-09-15 03:19:52.370165", "step": 974, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:52.399712", "step": 974, "epoch": 2 }, { "type": "loss", "content": 0.01474784966558218, "timestamp": "2025-09-15 03:19:52.401506", "step": 975, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:52.431277", "step": 975, "epoch": 2 }, { "type": "loss", "content": 0.021217798814177513, "timestamp": "2025-09-15 03:19:52.454490", "step": 976, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:52.483759", "step": 976, "epoch": 2 }, { "type": "loss", "content": 0.029132353141903877, "timestamp": "2025-09-15 03:19:52.485573", "step": 977, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:52.515487", "step": 977, "epoch": 2 }, { "type": "loss", "content": 0.010967997834086418, "timestamp": "2025-09-15 03:19:52.517194", "step": 978, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:52.547227", "step": 978, "epoch": 2 }, { "type": "loss", "content": 0.0184993464499712, "timestamp": "2025-09-15 03:19:52.549416", "step": 979, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:52.579279", "step": 979, "epoch": 2 }, { "type": "loss", "content": 0.0035073976032435894, "timestamp": "2025-09-15 03:19:52.602463", "step": 980, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:52.632206", "step": 980, "epoch": 2 }, { "type": "loss", "content": 0.004275471903383732, "timestamp": "2025-09-15 03:19:52.634135", "step": 981, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:52.663668", "step": 981, "epoch": 2 }, { "type": "loss", "content": 0.004823813680559397, "timestamp": "2025-09-15 03:19:52.666112", "step": 982, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:52.696134", "step": 982, "epoch": 2 }, { "type": "loss", "content": 0.03375326469540596, "timestamp": "2025-09-15 03:19:52.698203", "step": 983, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:52.728385", "step": 983, "epoch": 2 }, { "type": "loss", "content": 0.003615120192989707, "timestamp": "2025-09-15 03:19:52.751951", "step": 984, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:52.781703", "step": 984, "epoch": 2 }, { "type": "loss", "content": 0.023827284574508667, "timestamp": "2025-09-15 03:19:52.783702", "step": 985, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:52.815054", "step": 985, "epoch": 2 }, { "type": "loss", "content": 0.02614396996796131, "timestamp": "2025-09-15 03:19:52.817057", "step": 986, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:52.847061", "step": 986, "epoch": 2 }, { "type": "loss", "content": 0.02753402665257454, "timestamp": "2025-09-15 03:19:52.849014", "step": 987, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:52.878695", "step": 987, "epoch": 2 }, { "type": "loss", "content": 0.00410444475710392, "timestamp": "2025-09-15 03:19:52.901828", "step": 988, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:52.931324", "step": 988, "epoch": 2 }, { "type": "loss", "content": 0.03475097566843033, "timestamp": "2025-09-15 03:19:52.933176", "step": 989, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:52.963118", "step": 989, "epoch": 2 }, { "type": "loss", "content": 0.007826111279428005, "timestamp": "2025-09-15 03:19:52.965301", "step": 990, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:52.996524", "step": 990, "epoch": 2 }, { "type": "loss", "content": 0.0463503859937191, "timestamp": "2025-09-15 03:19:52.998410", "step": 991, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:53.027936", "step": 991, "epoch": 2 }, { "type": "loss", "content": 0.02141661010682583, "timestamp": "2025-09-15 03:19:53.051030", "step": 992, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:19:53.080958", "step": 992, "epoch": 2 }, { "type": "loss", "content": 0.021750137209892273, "timestamp": "2025-09-15 03:19:53.083252", "step": 993, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:53.113221", "step": 993, "epoch": 2 }, { "type": "loss", "content": 0.01930098421871662, "timestamp": "2025-09-15 03:19:53.115183", "step": 994, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:53.145170", "step": 994, "epoch": 2 }, { "type": "loss", "content": 0.0189370010048151, "timestamp": "2025-09-15 03:19:53.147364", "step": 995, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:53.177336", "step": 995, "epoch": 2 }, { "type": "loss", "content": 0.019197093322873116, "timestamp": "2025-09-15 03:19:53.200739", "step": 996, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:19:53.231394", "step": 996, "epoch": 2 }, { "type": "loss", "content": 0.020200148224830627, "timestamp": "2025-09-15 03:19:53.233327", "step": 997, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:53.262987", "step": 997, "epoch": 2 }, { "type": "loss", "content": 0.022475184872746468, "timestamp": "2025-09-15 03:19:53.265309", "step": 998, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:53.294892", "step": 998, "epoch": 2 }, { "type": "loss", "content": 0.024552693590521812, "timestamp": "2025-09-15 03:19:53.296946", "step": 999, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:19:53.326379", "step": 999, "epoch": 2 }, { "type": "loss", "content": 0.006172493565827608, "timestamp": "2025-09-15 03:19:53.349406", "step": 1000, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1000", "timestamp": "2025-09-15 03:20:00.254972", "step": 1000, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:00.288498", "step": 1000, "epoch": 2 }, { "type": "loss", "content": 0.02578757330775261, "timestamp": "2025-09-15 03:20:00.290858", "step": 1001, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:00.322099", "step": 1001, "epoch": 2 }, { "type": "loss", "content": 0.021511312574148178, "timestamp": "2025-09-15 03:20:00.324165", "step": 1002, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:00.354323", "step": 1002, "epoch": 2 }, { "type": "loss", "content": 0.02874942310154438, "timestamp": "2025-09-15 03:20:00.356530", "step": 1003, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:00.386747", "step": 1003, "epoch": 2 }, { "type": "loss", "content": 0.013618089258670807, "timestamp": "2025-09-15 03:20:00.410700", "step": 1004, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:00.441085", "step": 1004, "epoch": 2 }, { "type": "loss", "content": 0.01972813531756401, "timestamp": "2025-09-15 03:20:00.443308", "step": 1005, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:00.473291", "step": 1005, "epoch": 2 }, { "type": "loss", "content": 0.027795910835266113, "timestamp": "2025-09-15 03:20:00.475389", "step": 1006, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:00.506108", "step": 1006, "epoch": 2 }, { "type": "loss", "content": 0.021912166848778725, "timestamp": "2025-09-15 03:20:00.508306", "step": 1007, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:00.538933", "step": 1007, "epoch": 2 }, { "type": "loss", "content": 0.03454792872071266, "timestamp": "2025-09-15 03:20:00.562504", "step": 1008, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:00.593899", "step": 1008, "epoch": 2 }, { "type": "loss", "content": 0.02996695600450039, "timestamp": "2025-09-15 03:20:00.595860", "step": 1009, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:00.625973", "step": 1009, "epoch": 2 }, { "type": "loss", "content": 0.018901299685239792, "timestamp": "2025-09-15 03:20:00.628067", "step": 1010, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:00.657906", "step": 1010, "epoch": 2 }, { "type": "loss", "content": 0.009113982319831848, "timestamp": "2025-09-15 03:20:00.659997", "step": 1011, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:00.691078", "step": 1011, "epoch": 2 }, { "type": "loss", "content": 0.030469568446278572, "timestamp": "2025-09-15 03:20:00.714502", "step": 1012, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:00.744564", "step": 1012, "epoch": 2 }, { "type": "loss", "content": 0.046722956001758575, "timestamp": "2025-09-15 03:20:00.746632", "step": 1013, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:00.776566", "step": 1013, "epoch": 2 }, { "type": "loss", "content": 0.019341707229614258, "timestamp": "2025-09-15 03:20:00.778404", "step": 1014, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:00.807850", "step": 1014, "epoch": 2 }, { "type": "loss", "content": 0.01661626063287258, "timestamp": "2025-09-15 03:20:00.810043", "step": 1015, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:00.840688", "step": 1015, "epoch": 2 }, { "type": "loss", "content": 0.008797021582722664, "timestamp": "2025-09-15 03:20:00.864255", "step": 1016, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:00.894247", "step": 1016, "epoch": 2 }, { "type": "loss", "content": 0.02718176133930683, "timestamp": "2025-09-15 03:20:00.896243", "step": 1017, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:00.925650", "step": 1017, "epoch": 2 }, { "type": "loss", "content": 0.020584603771567345, "timestamp": "2025-09-15 03:20:00.927833", "step": 1018, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:00.957848", "step": 1018, "epoch": 2 }, { "type": "loss", "content": 0.011983073316514492, "timestamp": "2025-09-15 03:20:00.960083", "step": 1019, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:00.990180", "step": 1019, "epoch": 2 }, { "type": "loss", "content": 0.017778072506189346, "timestamp": "2025-09-15 03:20:01.013787", "step": 1020, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:01.043857", "step": 1020, "epoch": 2 }, { "type": "loss", "content": 0.025459052994847298, "timestamp": "2025-09-15 03:20:01.045827", "step": 1021, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:01.075973", "step": 1021, "epoch": 2 }, { "type": "loss", "content": 0.035514552146196365, "timestamp": "2025-09-15 03:20:01.077889", "step": 1022, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:01.108193", "step": 1022, "epoch": 2 }, { "type": "loss", "content": 0.0033756468910723925, "timestamp": "2025-09-15 03:20:01.110262", "step": 1023, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:01.140585", "step": 1023, "epoch": 2 }, { "type": "loss", "content": 0.008607217110693455, "timestamp": "2025-09-15 03:20:01.164036", "step": 1024, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:01.194052", "step": 1024, "epoch": 2 }, { "type": "loss", "content": 0.03825444355607033, "timestamp": "2025-09-15 03:20:01.195998", "step": 1025, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:01.226391", "step": 1025, "epoch": 2 }, { "type": "loss", "content": 0.04544388875365257, "timestamp": "2025-09-15 03:20:01.228431", "step": 1026, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:20:01.951985", "step": 1026, "epoch": 2 }, { "type": "pplx", "content": 59711159.30161082, "timestamp": "2025-09-15 03:20:01.954028", "step": 1026, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:20:01.982504", "step": 1026, "epoch": 2 }, { "type": "loss", "content": 0.05692798271775246, "timestamp": "2025-09-15 03:20:01.984823", "step": 1027, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:02.015273", "step": 1027, "epoch": 2 }, { "type": "loss", "content": 0.003602404845878482, "timestamp": "2025-09-15 03:20:02.038767", "step": 1028, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:02.069047", "step": 1028, "epoch": 2 }, { "type": "loss", "content": 0.013803867623209953, "timestamp": "2025-09-15 03:20:02.071440", "step": 1029, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:02.101812", "step": 1029, "epoch": 2 }, { "type": "loss", "content": 0.012036988511681557, "timestamp": "2025-09-15 03:20:02.104059", "step": 1030, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:02.134478", "step": 1030, "epoch": 2 }, { "type": "loss", "content": 0.011974446475505829, "timestamp": "2025-09-15 03:20:02.136849", "step": 1031, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:02.167057", "step": 1031, "epoch": 2 }, { "type": "loss", "content": 0.0036804601550102234, "timestamp": "2025-09-15 03:20:02.190434", "step": 1032, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:02.220407", "step": 1032, "epoch": 2 }, { "type": "loss", "content": 0.01505777146667242, "timestamp": "2025-09-15 03:20:02.222542", "step": 1033, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:02.252522", "step": 1033, "epoch": 2 }, { "type": "loss", "content": 0.017127230763435364, "timestamp": "2025-09-15 03:20:02.254610", "step": 1034, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:02.284586", "step": 1034, "epoch": 2 }, { "type": "loss", "content": 0.015601657330989838, "timestamp": "2025-09-15 03:20:02.286965", "step": 1035, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:02.317206", "step": 1035, "epoch": 2 }, { "type": "loss", "content": 0.006081894971430302, "timestamp": "2025-09-15 03:20:02.340566", "step": 1036, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:02.370407", "step": 1036, "epoch": 2 }, { "type": "loss", "content": 0.00815503392368555, "timestamp": "2025-09-15 03:20:02.372451", "step": 1037, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:02.403853", "step": 1037, "epoch": 2 }, { "type": "loss", "content": 0.042893558740615845, "timestamp": "2025-09-15 03:20:02.405886", "step": 1038, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:02.437135", "step": 1038, "epoch": 2 }, { "type": "loss", "content": 0.017841657623648643, "timestamp": "2025-09-15 03:20:02.439162", "step": 1039, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:02.469180", "step": 1039, "epoch": 2 }, { "type": "loss", "content": 0.024145500734448433, "timestamp": "2025-09-15 03:20:02.492792", "step": 1040, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:02.523327", "step": 1040, "epoch": 2 }, { "type": "loss", "content": 0.014505532570183277, "timestamp": "2025-09-15 03:20:02.525445", "step": 1041, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:02.554955", "step": 1041, "epoch": 2 }, { "type": "loss", "content": 0.009613566100597382, "timestamp": "2025-09-15 03:20:02.557049", "step": 1042, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:02.587058", "step": 1042, "epoch": 2 }, { "type": "loss", "content": 0.04523482546210289, "timestamp": "2025-09-15 03:20:02.590149", "step": 1043, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:02.619880", "step": 1043, "epoch": 2 }, { "type": "loss", "content": 0.01229308545589447, "timestamp": "2025-09-15 03:20:02.643746", "step": 1044, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:02.674542", "step": 1044, "epoch": 2 }, { "type": "loss", "content": 0.006579564418643713, "timestamp": "2025-09-15 03:20:02.676729", "step": 1045, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:02.723819", "step": 1045, "epoch": 2 }, { "type": "loss", "content": 0.011694149114191532, "timestamp": "2025-09-15 03:20:02.726307", "step": 1046, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:02.756165", "step": 1046, "epoch": 2 }, { "type": "loss", "content": 0.03836764767765999, "timestamp": "2025-09-15 03:20:02.758285", "step": 1047, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:02.787816", "step": 1047, "epoch": 2 }, { "type": "loss", "content": 0.01350962370634079, "timestamp": "2025-09-15 03:20:02.811173", "step": 1048, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:02.842403", "step": 1048, "epoch": 2 }, { "type": "loss", "content": 0.008706008084118366, "timestamp": "2025-09-15 03:20:02.844692", "step": 1049, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:02.874765", "step": 1049, "epoch": 2 }, { "type": "loss", "content": 0.01117317657917738, "timestamp": "2025-09-15 03:20:02.877109", "step": 1050, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:02.907109", "step": 1050, "epoch": 2 }, { "type": "loss", "content": 0.026028618216514587, "timestamp": "2025-09-15 03:20:02.909387", "step": 1051, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:02.939090", "step": 1051, "epoch": 2 }, { "type": "loss", "content": 0.011078967712819576, "timestamp": "2025-09-15 03:20:02.962999", "step": 1052, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:02.992792", "step": 1052, "epoch": 2 }, { "type": "loss", "content": 0.0195362139493227, "timestamp": "2025-09-15 03:20:02.995803", "step": 1053, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:03.025872", "step": 1053, "epoch": 2 }, { "type": "loss", "content": 0.027163058519363403, "timestamp": "2025-09-15 03:20:03.027935", "step": 1054, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:03.057539", "step": 1054, "epoch": 2 }, { "type": "loss", "content": 0.029660746455192566, "timestamp": "2025-09-15 03:20:03.059858", "step": 1055, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:03.089819", "step": 1055, "epoch": 2 }, { "type": "loss", "content": 0.03723704442381859, "timestamp": "2025-09-15 03:20:03.113177", "step": 1056, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:03.142899", "step": 1056, "epoch": 2 }, { "type": "loss", "content": 0.019413789734244347, "timestamp": "2025-09-15 03:20:03.145006", "step": 1057, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:03.175594", "step": 1057, "epoch": 2 }, { "type": "loss", "content": 0.023310324177145958, "timestamp": "2025-09-15 03:20:03.177732", "step": 1058, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:03.208151", "step": 1058, "epoch": 2 }, { "type": "loss", "content": 0.03065226413309574, "timestamp": "2025-09-15 03:20:03.210414", "step": 1059, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:03.240256", "step": 1059, "epoch": 2 }, { "type": "loss", "content": 0.014415273442864418, "timestamp": "2025-09-15 03:20:03.263736", "step": 1060, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:03.294766", "step": 1060, "epoch": 2 }, { "type": "loss", "content": 0.014950952492654324, "timestamp": "2025-09-15 03:20:03.297113", "step": 1061, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:03.329486", "step": 1061, "epoch": 2 }, { "type": "loss", "content": 0.01662154495716095, "timestamp": "2025-09-15 03:20:03.331787", "step": 1062, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:03.363111", "step": 1062, "epoch": 2 }, { "type": "loss", "content": 0.0073923333548009396, "timestamp": "2025-09-15 03:20:03.365302", "step": 1063, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:03.396160", "step": 1063, "epoch": 2 }, { "type": "loss", "content": 0.010172292590141296, "timestamp": "2025-09-15 03:20:03.419487", "step": 1064, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:03.450602", "step": 1064, "epoch": 2 }, { "type": "loss", "content": 0.03431609272956848, "timestamp": "2025-09-15 03:20:03.452623", "step": 1065, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:03.483055", "step": 1065, "epoch": 2 }, { "type": "loss", "content": 0.02533714286983013, "timestamp": "2025-09-15 03:20:03.485153", "step": 1066, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:03.515326", "step": 1066, "epoch": 2 }, { "type": "loss", "content": 0.0259034913033247, "timestamp": "2025-09-15 03:20:03.517558", "step": 1067, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:03.547202", "step": 1067, "epoch": 2 }, { "type": "loss", "content": 0.011607496067881584, "timestamp": "2025-09-15 03:20:03.570776", "step": 1068, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:03.600671", "step": 1068, "epoch": 2 }, { "type": "loss", "content": 0.006037650164216757, "timestamp": "2025-09-15 03:20:03.603619", "step": 1069, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:03.633847", "step": 1069, "epoch": 2 }, { "type": "loss", "content": 0.014436027966439724, "timestamp": "2025-09-15 03:20:03.635891", "step": 1070, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:03.665951", "step": 1070, "epoch": 2 }, { "type": "loss", "content": 0.020872337743639946, "timestamp": "2025-09-15 03:20:03.667991", "step": 1071, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:03.698968", "step": 1071, "epoch": 2 }, { "type": "loss", "content": 0.024816256016492844, "timestamp": "2025-09-15 03:20:03.722331", "step": 1072, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:03.753776", "step": 1072, "epoch": 2 }, { "type": "loss", "content": 0.010879909619688988, "timestamp": "2025-09-15 03:20:03.755802", "step": 1073, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:03.786671", "step": 1073, "epoch": 2 }, { "type": "loss", "content": 0.04255342856049538, "timestamp": "2025-09-15 03:20:03.788986", "step": 1074, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:03.818928", "step": 1074, "epoch": 2 }, { "type": "loss", "content": 0.04912864789366722, "timestamp": "2025-09-15 03:20:03.821220", "step": 1075, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:03.850401", "step": 1075, "epoch": 2 }, { "type": "loss", "content": 0.046083271503448486, "timestamp": "2025-09-15 03:20:03.873741", "step": 1076, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:03.903722", "step": 1076, "epoch": 2 }, { "type": "loss", "content": 0.02562423050403595, "timestamp": "2025-09-15 03:20:03.912649", "step": 1077, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:03.947024", "step": 1077, "epoch": 2 }, { "type": "loss", "content": 0.023624440655112267, "timestamp": "2025-09-15 03:20:03.949139", "step": 1078, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:03.978694", "step": 1078, "epoch": 2 }, { "type": "loss", "content": 0.017809877172112465, "timestamp": "2025-09-15 03:20:03.980881", "step": 1079, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:04.010270", "step": 1079, "epoch": 2 }, { "type": "loss", "content": 0.018705327063798904, "timestamp": "2025-09-15 03:20:04.033828", "step": 1080, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:04.063874", "step": 1080, "epoch": 2 }, { "type": "loss", "content": 0.017726384103298187, "timestamp": "2025-09-15 03:20:04.066214", "step": 1081, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:04.096536", "step": 1081, "epoch": 2 }, { "type": "loss", "content": 0.030536355450749397, "timestamp": "2025-09-15 03:20:04.098704", "step": 1082, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:04.128905", "step": 1082, "epoch": 2 }, { "type": "loss", "content": 0.017621448263525963, "timestamp": "2025-09-15 03:20:04.131006", "step": 1083, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:20:04.839692", "step": 1083, "epoch": 2 }, { "type": "pplx", "content": 55375998.91955021, "timestamp": "2025-09-15 03:20:04.842010", "step": 1083, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:04.870342", "step": 1083, "epoch": 2 }, { "type": "loss", "content": 0.015300673432648182, "timestamp": "2025-09-15 03:20:04.893914", "step": 1084, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:04.923917", "step": 1084, "epoch": 2 }, { "type": "loss", "content": 0.006749466527253389, "timestamp": "2025-09-15 03:20:04.925994", "step": 1085, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:04.955631", "step": 1085, "epoch": 2 }, { "type": "loss", "content": 0.01821901462972164, "timestamp": "2025-09-15 03:20:04.957923", "step": 1086, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:04.988137", "step": 1086, "epoch": 2 }, { "type": "loss", "content": 0.028511449694633484, "timestamp": "2025-09-15 03:20:04.990295", "step": 1087, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:05.019956", "step": 1087, "epoch": 2 }, { "type": "loss", "content": 0.00882384367287159, "timestamp": "2025-09-15 03:20:05.043536", "step": 1088, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:05.073986", "step": 1088, "epoch": 2 }, { "type": "loss", "content": 0.028568703681230545, "timestamp": "2025-09-15 03:20:05.076075", "step": 1089, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:05.105716", "step": 1089, "epoch": 2 }, { "type": "loss", "content": 0.01979498565196991, "timestamp": "2025-09-15 03:20:05.107895", "step": 1090, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:05.138185", "step": 1090, "epoch": 2 }, { "type": "loss", "content": 0.0432886965572834, "timestamp": "2025-09-15 03:20:05.140657", "step": 1091, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:05.170573", "step": 1091, "epoch": 2 }, { "type": "loss", "content": 0.007654269225895405, "timestamp": "2025-09-15 03:20:05.194149", "step": 1092, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:05.224357", "step": 1092, "epoch": 2 }, { "type": "loss", "content": 0.01625342108309269, "timestamp": "2025-09-15 03:20:05.226451", "step": 1093, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:05.256212", "step": 1093, "epoch": 2 }, { "type": "loss", "content": 0.011958093382418156, "timestamp": "2025-09-15 03:20:05.258352", "step": 1094, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:05.290265", "step": 1094, "epoch": 2 }, { "type": "loss", "content": 0.010880166664719582, "timestamp": "2025-09-15 03:20:05.292433", "step": 1095, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:05.321917", "step": 1095, "epoch": 2 }, { "type": "loss", "content": 0.006026288028806448, "timestamp": "2025-09-15 03:20:05.346549", "step": 1096, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:05.378254", "step": 1096, "epoch": 2 }, { "type": "loss", "content": 0.01763133518397808, "timestamp": "2025-09-15 03:20:05.380297", "step": 1097, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:05.409340", "step": 1097, "epoch": 2 }, { "type": "loss", "content": 0.021360328420996666, "timestamp": "2025-09-15 03:20:05.411697", "step": 1098, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:05.441489", "step": 1098, "epoch": 2 }, { "type": "loss", "content": 0.020162245258688927, "timestamp": "2025-09-15 03:20:05.443551", "step": 1099, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:05.472964", "step": 1099, "epoch": 2 }, { "type": "loss", "content": 0.026214588433504105, "timestamp": "2025-09-15 03:20:05.496728", "step": 1100, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:05.527459", "step": 1100, "epoch": 2 }, { "type": "loss", "content": 0.027630941942334175, "timestamp": "2025-09-15 03:20:05.529621", "step": 1101, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:05.559110", "step": 1101, "epoch": 2 }, { "type": "loss", "content": 0.01204682793468237, "timestamp": "2025-09-15 03:20:05.561272", "step": 1102, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:05.595782", "step": 1102, "epoch": 2 }, { "type": "loss", "content": 0.021185899153351784, "timestamp": "2025-09-15 03:20:05.598058", "step": 1103, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:05.631793", "step": 1103, "epoch": 2 }, { "type": "loss", "content": 0.016678977757692337, "timestamp": "2025-09-15 03:20:05.655330", "step": 1104, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:05.684801", "step": 1104, "epoch": 2 }, { "type": "loss", "content": 0.01232249103486538, "timestamp": "2025-09-15 03:20:05.687096", "step": 1105, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:05.716678", "step": 1105, "epoch": 2 }, { "type": "loss", "content": 0.010624246671795845, "timestamp": "2025-09-15 03:20:05.719051", "step": 1106, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:05.748786", "step": 1106, "epoch": 2 }, { "type": "loss", "content": 0.02412816509604454, "timestamp": "2025-09-15 03:20:05.750902", "step": 1107, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:05.780579", "step": 1107, "epoch": 2 }, { "type": "loss", "content": 0.0221096184104681, "timestamp": "2025-09-15 03:20:05.804000", "step": 1108, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:05.834426", "step": 1108, "epoch": 2 }, { "type": "loss", "content": 0.010182474739849567, "timestamp": "2025-09-15 03:20:05.836694", "step": 1109, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:05.867243", "step": 1109, "epoch": 2 }, { "type": "loss", "content": 0.017153101041913033, "timestamp": "2025-09-15 03:20:05.869479", "step": 1110, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:05.899424", "step": 1110, "epoch": 2 }, { "type": "loss", "content": 0.01956060342490673, "timestamp": "2025-09-15 03:20:05.901668", "step": 1111, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:05.931341", "step": 1111, "epoch": 2 }, { "type": "loss", "content": 0.02774369716644287, "timestamp": "2025-09-15 03:20:05.955226", "step": 1112, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:05.985122", "step": 1112, "epoch": 2 }, { "type": "loss", "content": 0.021901097148656845, "timestamp": "2025-09-15 03:20:05.987404", "step": 1113, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:06.017078", "step": 1113, "epoch": 2 }, { "type": "loss", "content": 0.012037809006869793, "timestamp": "2025-09-15 03:20:06.019277", "step": 1114, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:06.049254", "step": 1114, "epoch": 2 }, { "type": "loss", "content": 0.021336954087018967, "timestamp": "2025-09-15 03:20:06.051520", "step": 1115, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:06.081661", "step": 1115, "epoch": 2 }, { "type": "loss", "content": 0.017799798399209976, "timestamp": "2025-09-15 03:20:06.105196", "step": 1116, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:06.134710", "step": 1116, "epoch": 2 }, { "type": "loss", "content": 0.02415151707828045, "timestamp": "2025-09-15 03:20:06.136910", "step": 1117, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:06.166921", "step": 1117, "epoch": 2 }, { "type": "loss", "content": 0.01389431394636631, "timestamp": "2025-09-15 03:20:06.169292", "step": 1118, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:06.198838", "step": 1118, "epoch": 2 }, { "type": "loss", "content": 0.01987137459218502, "timestamp": "2025-09-15 03:20:06.200839", "step": 1119, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:06.230517", "step": 1119, "epoch": 2 }, { "type": "loss", "content": 0.028128299862146378, "timestamp": "2025-09-15 03:20:06.254036", "step": 1120, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:06.283330", "step": 1120, "epoch": 2 }, { "type": "loss", "content": 0.0228290855884552, "timestamp": "2025-09-15 03:20:06.285598", "step": 1121, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:06.315935", "step": 1121, "epoch": 2 }, { "type": "loss", "content": 0.011218971572816372, "timestamp": "2025-09-15 03:20:06.318052", "step": 1122, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:06.348089", "step": 1122, "epoch": 2 }, { "type": "loss", "content": 0.014887683093547821, "timestamp": "2025-09-15 03:20:06.350978", "step": 1123, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:06.380778", "step": 1123, "epoch": 2 }, { "type": "loss", "content": 0.016286566853523254, "timestamp": "2025-09-15 03:20:06.404378", "step": 1124, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:06.434178", "step": 1124, "epoch": 2 }, { "type": "loss", "content": 0.008015617728233337, "timestamp": "2025-09-15 03:20:06.436345", "step": 1125, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:06.467907", "step": 1125, "epoch": 2 }, { "type": "loss", "content": 0.023351913318037987, "timestamp": "2025-09-15 03:20:06.470131", "step": 1126, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:06.499936", "step": 1126, "epoch": 2 }, { "type": "loss", "content": 0.02599175088107586, "timestamp": "2025-09-15 03:20:06.501983", "step": 1127, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:06.532023", "step": 1127, "epoch": 2 }, { "type": "loss", "content": 0.0075608291663229465, "timestamp": "2025-09-15 03:20:06.555530", "step": 1128, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:06.586133", "step": 1128, "epoch": 2 }, { "type": "loss", "content": 0.042395059019327164, "timestamp": "2025-09-15 03:20:06.588495", "step": 1129, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:06.619710", "step": 1129, "epoch": 2 }, { "type": "loss", "content": 0.027750244364142418, "timestamp": "2025-09-15 03:20:06.621873", "step": 1130, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:06.652013", "step": 1130, "epoch": 2 }, { "type": "loss", "content": 0.014860682189464569, "timestamp": "2025-09-15 03:20:06.654137", "step": 1131, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:06.683968", "step": 1131, "epoch": 2 }, { "type": "loss", "content": 0.01270745974034071, "timestamp": "2025-09-15 03:20:06.707269", "step": 1132, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:06.737889", "step": 1132, "epoch": 2 }, { "type": "loss", "content": 0.02112249843776226, "timestamp": "2025-09-15 03:20:06.739991", "step": 1133, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:06.771668", "step": 1133, "epoch": 2 }, { "type": "loss", "content": 0.0172148235142231, "timestamp": "2025-09-15 03:20:06.773822", "step": 1134, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:06.803888", "step": 1134, "epoch": 2 }, { "type": "loss", "content": 0.024268055334687233, "timestamp": "2025-09-15 03:20:06.806230", "step": 1135, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:06.836326", "step": 1135, "epoch": 2 }, { "type": "loss", "content": 0.01310635544359684, "timestamp": "2025-09-15 03:20:06.860080", "step": 1136, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:06.889990", "step": 1136, "epoch": 2 }, { "type": "loss", "content": 0.008171673864126205, "timestamp": "2025-09-15 03:20:06.892003", "step": 1137, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:06.921888", "step": 1137, "epoch": 2 }, { "type": "loss", "content": 0.017726849764585495, "timestamp": "2025-09-15 03:20:06.923979", "step": 1138, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:06.953873", "step": 1138, "epoch": 2 }, { "type": "loss", "content": 0.011989779770374298, "timestamp": "2025-09-15 03:20:06.955940", "step": 1139, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:06.986103", "step": 1139, "epoch": 2 }, { "type": "loss", "content": 0.020538654178380966, "timestamp": "2025-09-15 03:20:07.009578", "step": 1140, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:20:07.720501", "step": 1140, "epoch": 2 }, { "type": "pplx", "content": 58533645.06906351, "timestamp": "2025-09-15 03:20:07.722844", "step": 1140, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:07.751397", "step": 1140, "epoch": 2 }, { "type": "loss", "content": 0.027201279997825623, "timestamp": "2025-09-15 03:20:07.754517", "step": 1141, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:07.785506", "step": 1141, "epoch": 2 }, { "type": "loss", "content": 0.010129685513675213, "timestamp": "2025-09-15 03:20:07.787641", "step": 1142, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:07.817709", "step": 1142, "epoch": 2 }, { "type": "loss", "content": 0.01725381426513195, "timestamp": "2025-09-15 03:20:07.819784", "step": 1143, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:07.850024", "step": 1143, "epoch": 2 }, { "type": "loss", "content": 0.03179285675287247, "timestamp": "2025-09-15 03:20:07.873600", "step": 1144, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:07.906912", "step": 1144, "epoch": 2 }, { "type": "loss", "content": 0.005451020319014788, "timestamp": "2025-09-15 03:20:07.909009", "step": 1145, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:07.939031", "step": 1145, "epoch": 2 }, { "type": "loss", "content": 0.019774705171585083, "timestamp": "2025-09-15 03:20:07.941412", "step": 1146, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:07.971686", "step": 1146, "epoch": 2 }, { "type": "loss", "content": 0.0117212338373065, "timestamp": "2025-09-15 03:20:07.973952", "step": 1147, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:08.003950", "step": 1147, "epoch": 2 }, { "type": "loss", "content": 0.012994612567126751, "timestamp": "2025-09-15 03:20:08.027561", "step": 1148, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:08.057386", "step": 1148, "epoch": 2 }, { "type": "loss", "content": 0.02703903801739216, "timestamp": "2025-09-15 03:20:08.060411", "step": 1149, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:08.090655", "step": 1149, "epoch": 2 }, { "type": "loss", "content": 0.014843891374766827, "timestamp": "2025-09-15 03:20:08.099855", "step": 1150, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:08.132891", "step": 1150, "epoch": 2 }, { "type": "loss", "content": 0.03394562751054764, "timestamp": "2025-09-15 03:20:08.135316", "step": 1151, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:08.165110", "step": 1151, "epoch": 2 }, { "type": "loss", "content": 0.01329898927360773, "timestamp": "2025-09-15 03:20:08.188693", "step": 1152, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:08.218582", "step": 1152, "epoch": 2 }, { "type": "loss", "content": 0.011320680379867554, "timestamp": "2025-09-15 03:20:08.220791", "step": 1153, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:08.250814", "step": 1153, "epoch": 2 }, { "type": "loss", "content": 0.011235682293772697, "timestamp": "2025-09-15 03:20:08.253892", "step": 1154, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:08.284099", "step": 1154, "epoch": 2 }, { "type": "loss", "content": 0.021955570206046104, "timestamp": "2025-09-15 03:20:08.286123", "step": 1155, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:08.322440", "step": 1155, "epoch": 2 }, { "type": "loss", "content": 0.02674642577767372, "timestamp": "2025-09-15 03:20:08.346116", "step": 1156, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:08.376096", "step": 1156, "epoch": 2 }, { "type": "loss", "content": 0.01354469545185566, "timestamp": "2025-09-15 03:20:08.378318", "step": 1157, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:08.425883", "step": 1157, "epoch": 2 }, { "type": "loss", "content": 0.00968851987272501, "timestamp": "2025-09-15 03:20:08.428236", "step": 1158, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:08.458720", "step": 1158, "epoch": 2 }, { "type": "loss", "content": 0.013550050556659698, "timestamp": "2025-09-15 03:20:08.461019", "step": 1159, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:08.491579", "step": 1159, "epoch": 2 }, { "type": "loss", "content": 0.00861669797450304, "timestamp": "2025-09-15 03:20:08.515133", "step": 1160, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:08.545114", "step": 1160, "epoch": 2 }, { "type": "loss", "content": 0.007723599206656218, "timestamp": "2025-09-15 03:20:08.547419", "step": 1161, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:08.577516", "step": 1161, "epoch": 2 }, { "type": "loss", "content": 0.026734083890914917, "timestamp": "2025-09-15 03:20:08.582763", "step": 1162, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:08.613178", "step": 1162, "epoch": 2 }, { "type": "loss", "content": 0.01663447730243206, "timestamp": "2025-09-15 03:20:08.615406", "step": 1163, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:08.645653", "step": 1163, "epoch": 2 }, { "type": "loss", "content": 0.02458677813410759, "timestamp": "2025-09-15 03:20:08.669304", "step": 1164, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:08.698900", "step": 1164, "epoch": 2 }, { "type": "loss", "content": 0.01422831416130066, "timestamp": "2025-09-15 03:20:08.702184", "step": 1165, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:08.732012", "step": 1165, "epoch": 2 }, { "type": "loss", "content": 0.018267182633280754, "timestamp": "2025-09-15 03:20:08.734407", "step": 1166, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:08.764390", "step": 1166, "epoch": 2 }, { "type": "loss", "content": 0.010763383470475674, "timestamp": "2025-09-15 03:20:08.768328", "step": 1167, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:08.800328", "step": 1167, "epoch": 2 }, { "type": "loss", "content": 0.04157629609107971, "timestamp": "2025-09-15 03:20:08.823943", "step": 1168, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:08.853756", "step": 1168, "epoch": 2 }, { "type": "loss", "content": 0.015311472117900848, "timestamp": "2025-09-15 03:20:08.856072", "step": 1169, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:08.886614", "step": 1169, "epoch": 2 }, { "type": "loss", "content": 0.008150981739163399, "timestamp": "2025-09-15 03:20:08.888892", "step": 1170, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:08.920412", "step": 1170, "epoch": 2 }, { "type": "loss", "content": 0.014152586460113525, "timestamp": "2025-09-15 03:20:08.922847", "step": 1171, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:08.953027", "step": 1171, "epoch": 2 }, { "type": "loss", "content": 0.007704091724008322, "timestamp": "2025-09-15 03:20:08.976592", "step": 1172, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:20:09.013147", "step": 1172, "epoch": 2 }, { "type": "loss", "content": 0.015262416563928127, "timestamp": "2025-09-15 03:20:09.015163", "step": 1173, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:09.045465", "step": 1173, "epoch": 2 }, { "type": "loss", "content": 0.012681369669735432, "timestamp": "2025-09-15 03:20:09.047517", "step": 1174, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:09.077389", "step": 1174, "epoch": 2 }, { "type": "loss", "content": 0.020055728033185005, "timestamp": "2025-09-15 03:20:09.079517", "step": 1175, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:09.109478", "step": 1175, "epoch": 2 }, { "type": "loss", "content": 0.010338046588003635, "timestamp": "2025-09-15 03:20:09.132776", "step": 1176, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:09.164810", "step": 1176, "epoch": 2 }, { "type": "loss", "content": 0.026065126061439514, "timestamp": "2025-09-15 03:20:09.172457", "step": 1177, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:09.212075", "step": 1177, "epoch": 2 }, { "type": "loss", "content": 0.014396383427083492, "timestamp": "2025-09-15 03:20:09.222229", "step": 1178, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:09.255510", "step": 1178, "epoch": 2 }, { "type": "loss", "content": 0.027096567675471306, "timestamp": "2025-09-15 03:20:09.259540", "step": 1179, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:09.290560", "step": 1179, "epoch": 2 }, { "type": "loss", "content": 0.015955336391925812, "timestamp": "2025-09-15 03:20:09.314189", "step": 1180, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:09.344417", "step": 1180, "epoch": 2 }, { "type": "loss", "content": 0.030931200832128525, "timestamp": "2025-09-15 03:20:09.346574", "step": 1181, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:09.376598", "step": 1181, "epoch": 2 }, { "type": "loss", "content": 0.031671468168497086, "timestamp": "2025-09-15 03:20:09.378877", "step": 1182, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:09.409383", "step": 1182, "epoch": 2 }, { "type": "loss", "content": 0.015935173258185387, "timestamp": "2025-09-15 03:20:09.411488", "step": 1183, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:09.441534", "step": 1183, "epoch": 2 }, { "type": "loss", "content": 0.007875466719269753, "timestamp": "2025-09-15 03:20:09.465515", "step": 1184, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:09.495876", "step": 1184, "epoch": 2 }, { "type": "loss", "content": 0.021102117374539375, "timestamp": "2025-09-15 03:20:09.498337", "step": 1185, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:09.529458", "step": 1185, "epoch": 2 }, { "type": "loss", "content": 0.015338304452598095, "timestamp": "2025-09-15 03:20:09.532557", "step": 1186, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:09.566516", "step": 1186, "epoch": 2 }, { "type": "loss", "content": 0.0060651483945548534, "timestamp": "2025-09-15 03:20:09.568660", "step": 1187, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:09.598459", "step": 1187, "epoch": 2 }, { "type": "loss", "content": 0.009633776731789112, "timestamp": "2025-09-15 03:20:09.623191", "step": 1188, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:09.653423", "step": 1188, "epoch": 2 }, { "type": "loss", "content": 0.014106557704508305, "timestamp": "2025-09-15 03:20:09.657651", "step": 1189, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:09.688411", "step": 1189, "epoch": 2 }, { "type": "loss", "content": 0.027194740250706673, "timestamp": "2025-09-15 03:20:09.691131", "step": 1190, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:09.723940", "step": 1190, "epoch": 2 }, { "type": "loss", "content": 0.012174146249890327, "timestamp": "2025-09-15 03:20:09.726496", "step": 1191, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:09.757722", "step": 1191, "epoch": 2 }, { "type": "loss", "content": 0.01647491380572319, "timestamp": "2025-09-15 03:20:09.781245", "step": 1192, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:09.812233", "step": 1192, "epoch": 2 }, { "type": "loss", "content": 0.015903526917099953, "timestamp": "2025-09-15 03:20:09.814379", "step": 1193, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:09.845418", "step": 1193, "epoch": 2 }, { "type": "loss", "content": 0.0020331903360784054, "timestamp": "2025-09-15 03:20:09.847565", "step": 1194, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:09.877826", "step": 1194, "epoch": 2 }, { "type": "loss", "content": 0.044761087745428085, "timestamp": "2025-09-15 03:20:09.879981", "step": 1195, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:09.912771", "step": 1195, "epoch": 2 }, { "type": "loss", "content": 0.012461199425160885, "timestamp": "2025-09-15 03:20:09.936396", "step": 1196, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:09.966526", "step": 1196, "epoch": 2 }, { "type": "loss", "content": 0.05127660185098648, "timestamp": "2025-09-15 03:20:09.968980", "step": 1197, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:20:10.683931", "step": 1197, "epoch": 2 }, { "type": "pplx", "content": 67346794.62348509, "timestamp": "2025-09-15 03:20:10.686006", "step": 1197, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:10.715067", "step": 1197, "epoch": 2 }, { "type": "loss", "content": 0.006219768431037664, "timestamp": "2025-09-15 03:20:10.717533", "step": 1198, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:10.747307", "step": 1198, "epoch": 2 }, { "type": "loss", "content": 0.012562769465148449, "timestamp": "2025-09-15 03:20:10.749519", "step": 1199, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:10.779485", "step": 1199, "epoch": 2 }, { "type": "loss", "content": 0.019395234063267708, "timestamp": "2025-09-15 03:20:10.803411", "step": 1200, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:10.834066", "step": 1200, "epoch": 2 }, { "type": "loss", "content": 0.046573035418987274, "timestamp": "2025-09-15 03:20:10.836131", "step": 1201, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:10.866619", "step": 1201, "epoch": 2 }, { "type": "loss", "content": 0.0038583676796406507, "timestamp": "2025-09-15 03:20:10.868969", "step": 1202, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:10.899690", "step": 1202, "epoch": 2 }, { "type": "loss", "content": 0.002558397827669978, "timestamp": "2025-09-15 03:20:10.902205", "step": 1203, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:20:10.933138", "step": 1203, "epoch": 2 }, { "type": "loss", "content": 0.005045489873737097, "timestamp": "2025-09-15 03:20:10.956734", "step": 1204, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:10.989591", "step": 1204, "epoch": 2 }, { "type": "loss", "content": 0.04085572436451912, "timestamp": "2025-09-15 03:20:10.991675", "step": 1205, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:11.021099", "step": 1205, "epoch": 2 }, { "type": "loss", "content": 0.03398078307509422, "timestamp": "2025-09-15 03:20:11.023213", "step": 1206, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:11.052879", "step": 1206, "epoch": 2 }, { "type": "loss", "content": 0.0036212815903127193, "timestamp": "2025-09-15 03:20:11.055072", "step": 1207, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:11.085326", "step": 1207, "epoch": 2 }, { "type": "loss", "content": 0.015824228525161743, "timestamp": "2025-09-15 03:20:11.109081", "step": 1208, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:11.139648", "step": 1208, "epoch": 2 }, { "type": "loss", "content": 0.014186462387442589, "timestamp": "2025-09-15 03:20:11.141703", "step": 1209, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:11.171602", "step": 1209, "epoch": 2 }, { "type": "loss", "content": 0.016707351431250572, "timestamp": "2025-09-15 03:20:11.173901", "step": 1210, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:11.203784", "step": 1210, "epoch": 2 }, { "type": "loss", "content": 0.04144059866666794, "timestamp": "2025-09-15 03:20:11.206167", "step": 1211, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:11.235944", "step": 1211, "epoch": 2 }, { "type": "loss", "content": 0.013599964790046215, "timestamp": "2025-09-15 03:20:11.259705", "step": 1212, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:11.290162", "step": 1212, "epoch": 2 }, { "type": "loss", "content": 0.007864146493375301, "timestamp": "2025-09-15 03:20:11.292299", "step": 1213, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:11.321995", "step": 1213, "epoch": 2 }, { "type": "loss", "content": 0.009913211688399315, "timestamp": "2025-09-15 03:20:11.324085", "step": 1214, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:11.354466", "step": 1214, "epoch": 2 }, { "type": "loss", "content": 0.017688969150185585, "timestamp": "2025-09-15 03:20:11.356650", "step": 1215, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:11.386482", "step": 1215, "epoch": 2 }, { "type": "loss", "content": 0.023656124249100685, "timestamp": "2025-09-15 03:20:11.410121", "step": 1216, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:11.440142", "step": 1216, "epoch": 2 }, { "type": "loss", "content": 0.011671700514853, "timestamp": "2025-09-15 03:20:11.442526", "step": 1217, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:11.472326", "step": 1217, "epoch": 2 }, { "type": "loss", "content": 0.006454241927713156, "timestamp": "2025-09-15 03:20:11.474413", "step": 1218, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:11.504569", "step": 1218, "epoch": 2 }, { "type": "loss", "content": 0.007134907878935337, "timestamp": "2025-09-15 03:20:11.506877", "step": 1219, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:11.536463", "step": 1219, "epoch": 2 }, { "type": "loss", "content": 0.00786716677248478, "timestamp": "2025-09-15 03:20:11.560023", "step": 1220, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:11.589852", "step": 1220, "epoch": 2 }, { "type": "loss", "content": 0.011776251718401909, "timestamp": "2025-09-15 03:20:11.592116", "step": 1221, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:11.621964", "step": 1221, "epoch": 2 }, { "type": "loss", "content": 0.007592611480504274, "timestamp": "2025-09-15 03:20:11.624201", "step": 1222, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:11.654888", "step": 1222, "epoch": 2 }, { "type": "loss", "content": 0.008074778132140636, "timestamp": "2025-09-15 03:20:11.657461", "step": 1223, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:11.687227", "step": 1223, "epoch": 2 }, { "type": "loss", "content": 0.009150748141109943, "timestamp": "2025-09-15 03:20:11.710887", "step": 1224, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:11.741130", "step": 1224, "epoch": 2 }, { "type": "loss", "content": 0.014765610918402672, "timestamp": "2025-09-15 03:20:11.743485", "step": 1225, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:11.773640", "step": 1225, "epoch": 2 }, { "type": "loss", "content": 0.003772977739572525, "timestamp": "2025-09-15 03:20:11.776200", "step": 1226, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:11.806624", "step": 1226, "epoch": 2 }, { "type": "loss", "content": 0.009069901891052723, "timestamp": "2025-09-15 03:20:11.809403", "step": 1227, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:11.839226", "step": 1227, "epoch": 2 }, { "type": "loss", "content": 0.015366188250482082, "timestamp": "2025-09-15 03:20:11.862951", "step": 1228, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:11.892889", "step": 1228, "epoch": 2 }, { "type": "loss", "content": 0.00551746878772974, "timestamp": "2025-09-15 03:20:11.895003", "step": 1229, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:11.924579", "step": 1229, "epoch": 2 }, { "type": "loss", "content": 0.032362472265958786, "timestamp": "2025-09-15 03:20:11.926748", "step": 1230, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:11.958080", "step": 1230, "epoch": 2 }, { "type": "loss", "content": 0.008966001681983471, "timestamp": "2025-09-15 03:20:11.960259", "step": 1231, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:11.989742", "step": 1231, "epoch": 2 }, { "type": "loss", "content": 0.020234253257513046, "timestamp": "2025-09-15 03:20:12.013381", "step": 1232, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:12.043086", "step": 1232, "epoch": 2 }, { "type": "loss", "content": 0.0029388617258518934, "timestamp": "2025-09-15 03:20:12.045211", "step": 1233, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:12.074601", "step": 1233, "epoch": 2 }, { "type": "loss", "content": 0.008875792846083641, "timestamp": "2025-09-15 03:20:12.076710", "step": 1234, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:12.107652", "step": 1234, "epoch": 2 }, { "type": "loss", "content": 0.018813664093613625, "timestamp": "2025-09-15 03:20:12.111082", "step": 1235, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:12.141094", "step": 1235, "epoch": 2 }, { "type": "loss", "content": 0.010143798775970936, "timestamp": "2025-09-15 03:20:12.164663", "step": 1236, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:12.194883", "step": 1236, "epoch": 2 }, { "type": "loss", "content": 0.0108027970418334, "timestamp": "2025-09-15 03:20:12.197020", "step": 1237, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:12.227585", "step": 1237, "epoch": 2 }, { "type": "loss", "content": 0.013201083056628704, "timestamp": "2025-09-15 03:20:12.229748", "step": 1238, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:12.260484", "step": 1238, "epoch": 2 }, { "type": "loss", "content": 0.02639073133468628, "timestamp": "2025-09-15 03:20:12.262578", "step": 1239, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:12.293155", "step": 1239, "epoch": 2 }, { "type": "loss", "content": 0.01808268018066883, "timestamp": "2025-09-15 03:20:12.316858", "step": 1240, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:12.347508", "step": 1240, "epoch": 2 }, { "type": "loss", "content": 0.006471390835940838, "timestamp": "2025-09-15 03:20:12.349692", "step": 1241, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:12.380560", "step": 1241, "epoch": 2 }, { "type": "loss", "content": 0.018432533368468285, "timestamp": "2025-09-15 03:20:12.382822", "step": 1242, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:12.414986", "step": 1242, "epoch": 2 }, { "type": "loss", "content": 0.022524258121848106, "timestamp": "2025-09-15 03:20:12.417103", "step": 1243, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:12.447432", "step": 1243, "epoch": 2 }, { "type": "loss", "content": 0.024223115295171738, "timestamp": "2025-09-15 03:20:12.471119", "step": 1244, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:20:12.501793", "step": 1244, "epoch": 2 }, { "type": "loss", "content": 0.026791978627443314, "timestamp": "2025-09-15 03:20:12.503876", "step": 1245, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:12.534728", "step": 1245, "epoch": 2 }, { "type": "loss", "content": 0.02190900780260563, "timestamp": "2025-09-15 03:20:12.536895", "step": 1246, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:12.567362", "step": 1246, "epoch": 2 }, { "type": "loss", "content": 0.024278011173009872, "timestamp": "2025-09-15 03:20:12.569589", "step": 1247, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:12.599841", "step": 1247, "epoch": 2 }, { "type": "loss", "content": 0.010498437099158764, "timestamp": "2025-09-15 03:20:12.623352", "step": 1248, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:12.653854", "step": 1248, "epoch": 2 }, { "type": "loss", "content": 0.027558788657188416, "timestamp": "2025-09-15 03:20:12.656104", "step": 1249, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:12.686587", "step": 1249, "epoch": 2 }, { "type": "loss", "content": 0.029875023290514946, "timestamp": "2025-09-15 03:20:12.688814", "step": 1250, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:12.719071", "step": 1250, "epoch": 2 }, { "type": "loss", "content": 0.020839480683207512, "timestamp": "2025-09-15 03:20:12.721354", "step": 1251, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:12.751391", "step": 1251, "epoch": 2 }, { "type": "loss", "content": 0.009385906159877777, "timestamp": "2025-09-15 03:20:12.775538", "step": 1252, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:20:12.806477", "step": 1252, "epoch": 2 }, { "type": "loss", "content": 0.002834519138559699, "timestamp": "2025-09-15 03:20:12.808704", "step": 1253, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:12.839544", "step": 1253, "epoch": 2 }, { "type": "loss", "content": 0.010481811128556728, "timestamp": "2025-09-15 03:20:12.841808", "step": 1254, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:20:13.560081", "step": 1254, "epoch": 2 }, { "type": "pplx", "content": 69112101.07041304, "timestamp": "2025-09-15 03:20:13.562009", "step": 1254, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:13.591862", "step": 1254, "epoch": 2 }, { "type": "loss", "content": 0.007346840109676123, "timestamp": "2025-09-15 03:20:13.594007", "step": 1255, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:13.624216", "step": 1255, "epoch": 2 }, { "type": "loss", "content": 0.02451970987021923, "timestamp": "2025-09-15 03:20:13.647861", "step": 1256, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:13.678154", "step": 1256, "epoch": 2 }, { "type": "loss", "content": 0.016992518678307533, "timestamp": "2025-09-15 03:20:13.680209", "step": 1257, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:13.710544", "step": 1257, "epoch": 2 }, { "type": "loss", "content": 0.023409055545926094, "timestamp": "2025-09-15 03:20:13.712656", "step": 1258, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:13.742663", "step": 1258, "epoch": 2 }, { "type": "loss", "content": 0.008774764835834503, "timestamp": "2025-09-15 03:20:13.745003", "step": 1259, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:13.775108", "step": 1259, "epoch": 2 }, { "type": "loss", "content": 0.0132039999589324, "timestamp": "2025-09-15 03:20:13.798607", "step": 1260, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:13.829021", "step": 1260, "epoch": 2 }, { "type": "loss", "content": 0.05545142665505409, "timestamp": "2025-09-15 03:20:13.831239", "step": 1261, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:13.861887", "step": 1261, "epoch": 2 }, { "type": "loss", "content": 0.013375957496464252, "timestamp": "2025-09-15 03:20:13.863961", "step": 1262, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:13.894159", "step": 1262, "epoch": 2 }, { "type": "loss", "content": 0.020845327526330948, "timestamp": "2025-09-15 03:20:13.896429", "step": 1263, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:13.927009", "step": 1263, "epoch": 2 }, { "type": "loss", "content": 0.01673973724246025, "timestamp": "2025-09-15 03:20:13.950498", "step": 1264, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:13.980407", "step": 1264, "epoch": 2 }, { "type": "loss", "content": 0.008590168319642544, "timestamp": "2025-09-15 03:20:13.982498", "step": 1265, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:14.013816", "step": 1265, "epoch": 2 }, { "type": "loss", "content": 0.013757781125605106, "timestamp": "2025-09-15 03:20:14.016299", "step": 1266, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:14.046386", "step": 1266, "epoch": 2 }, { "type": "loss", "content": 0.020589904859662056, "timestamp": "2025-09-15 03:20:14.048828", "step": 1267, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:14.078507", "step": 1267, "epoch": 2 }, { "type": "loss", "content": 0.02532745711505413, "timestamp": "2025-09-15 03:20:14.102254", "step": 1268, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:14.131792", "step": 1268, "epoch": 2 }, { "type": "loss", "content": 0.015998464077711105, "timestamp": "2025-09-15 03:20:14.133763", "step": 1269, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:14.164978", "step": 1269, "epoch": 2 }, { "type": "loss", "content": 0.02568703703582287, "timestamp": "2025-09-15 03:20:14.167046", "step": 1270, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:14.196717", "step": 1270, "epoch": 2 }, { "type": "loss", "content": 0.006882861256599426, "timestamp": "2025-09-15 03:20:14.198870", "step": 1271, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:14.228951", "step": 1271, "epoch": 2 }, { "type": "loss", "content": 0.019829755648970604, "timestamp": "2025-09-15 03:20:14.252841", "step": 1272, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:14.282880", "step": 1272, "epoch": 2 }, { "type": "loss", "content": 0.009075929410755634, "timestamp": "2025-09-15 03:20:14.284964", "step": 1273, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:14.315013", "step": 1273, "epoch": 2 }, { "type": "loss", "content": 0.021153524518013, "timestamp": "2025-09-15 03:20:14.317173", "step": 1274, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:14.347532", "step": 1274, "epoch": 2 }, { "type": "loss", "content": 0.00710141658782959, "timestamp": "2025-09-15 03:20:14.349900", "step": 1275, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:14.379628", "step": 1275, "epoch": 2 }, { "type": "loss", "content": 0.004452737048268318, "timestamp": "2025-09-15 03:20:14.403116", "step": 1276, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:14.432916", "step": 1276, "epoch": 2 }, { "type": "loss", "content": 0.017650265246629715, "timestamp": "2025-09-15 03:20:14.435273", "step": 1277, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:14.465905", "step": 1277, "epoch": 2 }, { "type": "loss", "content": 0.011342850513756275, "timestamp": "2025-09-15 03:20:14.468067", "step": 1278, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:14.498281", "step": 1278, "epoch": 2 }, { "type": "loss", "content": 0.0378478541970253, "timestamp": "2025-09-15 03:20:14.500405", "step": 1279, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:14.530022", "step": 1279, "epoch": 2 }, { "type": "loss", "content": 0.02093484438955784, "timestamp": "2025-09-15 03:20:14.553779", "step": 1280, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:14.584569", "step": 1280, "epoch": 2 }, { "type": "loss", "content": 0.008996201679110527, "timestamp": "2025-09-15 03:20:14.586753", "step": 1281, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:14.618026", "step": 1281, "epoch": 2 }, { "type": "loss", "content": 0.016198158264160156, "timestamp": "2025-09-15 03:20:14.620304", "step": 1282, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:14.650828", "step": 1282, "epoch": 2 }, { "type": "loss", "content": 0.006912777666002512, "timestamp": "2025-09-15 03:20:14.652885", "step": 1283, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:20:14.684063", "step": 1283, "epoch": 2 }, { "type": "loss", "content": 0.006836120970547199, "timestamp": "2025-09-15 03:20:14.707705", "step": 1284, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:14.738329", "step": 1284, "epoch": 2 }, { "type": "loss", "content": 0.030089562758803368, "timestamp": "2025-09-15 03:20:14.740411", "step": 1285, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:14.770146", "step": 1285, "epoch": 2 }, { "type": "loss", "content": 0.005958269815891981, "timestamp": "2025-09-15 03:20:14.772294", "step": 1286, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:14.802219", "step": 1286, "epoch": 2 }, { "type": "loss", "content": 0.00943565834313631, "timestamp": "2025-09-15 03:20:14.804382", "step": 1287, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:14.833953", "step": 1287, "epoch": 2 }, { "type": "loss", "content": 0.004659402649849653, "timestamp": "2025-09-15 03:20:14.857443", "step": 1288, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:14.887074", "step": 1288, "epoch": 2 }, { "type": "loss", "content": 0.029415473341941833, "timestamp": "2025-09-15 03:20:14.889341", "step": 1289, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:14.919235", "step": 1289, "epoch": 2 }, { "type": "loss", "content": 0.010652023367583752, "timestamp": "2025-09-15 03:20:14.921368", "step": 1290, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:14.951211", "step": 1290, "epoch": 2 }, { "type": "loss", "content": 0.017675837501883507, "timestamp": "2025-09-15 03:20:14.953324", "step": 1291, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:14.982654", "step": 1291, "epoch": 2 }, { "type": "loss", "content": 0.018566444516181946, "timestamp": "2025-09-15 03:20:15.006423", "step": 1292, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:15.036542", "step": 1292, "epoch": 2 }, { "type": "loss", "content": 0.004875097889453173, "timestamp": "2025-09-15 03:20:15.038698", "step": 1293, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:15.068324", "step": 1293, "epoch": 2 }, { "type": "loss", "content": 0.004339300561696291, "timestamp": "2025-09-15 03:20:15.070425", "step": 1294, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:15.101645", "step": 1294, "epoch": 2 }, { "type": "loss", "content": 0.0056786248460412025, "timestamp": "2025-09-15 03:20:15.103794", "step": 1295, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:15.134030", "step": 1295, "epoch": 2 }, { "type": "loss", "content": 0.023560095578432083, "timestamp": "2025-09-15 03:20:15.157434", "step": 1296, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:15.187504", "step": 1296, "epoch": 2 }, { "type": "loss", "content": 0.014967325143516064, "timestamp": "2025-09-15 03:20:15.189652", "step": 1297, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:15.220167", "step": 1297, "epoch": 2 }, { "type": "loss", "content": 0.022020941600203514, "timestamp": "2025-09-15 03:20:15.222368", "step": 1298, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:15.252172", "step": 1298, "epoch": 2 }, { "type": "loss", "content": 0.019915880635380745, "timestamp": "2025-09-15 03:20:15.254183", "step": 1299, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:15.284120", "step": 1299, "epoch": 2 }, { "type": "loss", "content": 0.013933622278273106, "timestamp": "2025-09-15 03:20:15.307468", "step": 1300, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:15.338153", "step": 1300, "epoch": 2 }, { "type": "loss", "content": 0.015517765656113625, "timestamp": "2025-09-15 03:20:15.340302", "step": 1301, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:15.370246", "step": 1301, "epoch": 2 }, { "type": "loss", "content": 0.009845642372965813, "timestamp": "2025-09-15 03:20:15.372527", "step": 1302, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:15.403191", "step": 1302, "epoch": 2 }, { "type": "loss", "content": 0.011282161809504032, "timestamp": "2025-09-15 03:20:15.405198", "step": 1303, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:15.435349", "step": 1303, "epoch": 2 }, { "type": "loss", "content": 0.004566065035760403, "timestamp": "2025-09-15 03:20:15.458956", "step": 1304, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:15.489372", "step": 1304, "epoch": 2 }, { "type": "loss", "content": 0.012090700678527355, "timestamp": "2025-09-15 03:20:15.491505", "step": 1305, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:15.523252", "step": 1305, "epoch": 2 }, { "type": "loss", "content": 0.0067639728076756, "timestamp": "2025-09-15 03:20:15.525430", "step": 1306, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:15.555625", "step": 1306, "epoch": 2 }, { "type": "loss", "content": 0.0149248531088233, "timestamp": "2025-09-15 03:20:15.557641", "step": 1307, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:15.587512", "step": 1307, "epoch": 2 }, { "type": "loss", "content": 0.015327530913054943, "timestamp": "2025-09-15 03:20:15.611240", "step": 1308, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:15.640899", "step": 1308, "epoch": 2 }, { "type": "loss", "content": 0.007515019737184048, "timestamp": "2025-09-15 03:20:15.643245", "step": 1309, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:15.673234", "step": 1309, "epoch": 2 }, { "type": "loss", "content": 0.01602526567876339, "timestamp": "2025-09-15 03:20:15.675673", "step": 1310, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:20:15.706729", "step": 1310, "epoch": 2 }, { "type": "loss", "content": 0.021097447723150253, "timestamp": "2025-09-15 03:20:15.709472", "step": 1311, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:20:16.417438", "step": 1311, "epoch": 2 }, { "type": "pplx", "content": 76183460.07738191, "timestamp": "2025-09-15 03:20:16.419855", "step": 1311, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:16.449079", "step": 1311, "epoch": 2 }, { "type": "loss", "content": 0.003573415335267782, "timestamp": "2025-09-15 03:20:16.472625", "step": 1312, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:16.502606", "step": 1312, "epoch": 2 }, { "type": "loss", "content": 0.005960460286587477, "timestamp": "2025-09-15 03:20:16.504818", "step": 1313, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:16.535033", "step": 1313, "epoch": 2 }, { "type": "loss", "content": 0.0011057720985263586, "timestamp": "2025-09-15 03:20:16.537015", "step": 1314, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:16.567323", "step": 1314, "epoch": 2 }, { "type": "loss", "content": 0.012723295949399471, "timestamp": "2025-09-15 03:20:16.569501", "step": 1315, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:16.599734", "step": 1315, "epoch": 2 }, { "type": "loss", "content": 0.0056397877633571625, "timestamp": "2025-09-15 03:20:16.623539", "step": 1316, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:16.653233", "step": 1316, "epoch": 2 }, { "type": "loss", "content": 0.01716693863272667, "timestamp": "2025-09-15 03:20:16.655336", "step": 1317, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:16.685136", "step": 1317, "epoch": 2 }, { "type": "loss", "content": 0.002221998292952776, "timestamp": "2025-09-15 03:20:16.687158", "step": 1318, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:16.716713", "step": 1318, "epoch": 2 }, { "type": "loss", "content": 0.02662539668381214, "timestamp": "2025-09-15 03:20:16.719082", "step": 1319, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:16.748506", "step": 1319, "epoch": 2 }, { "type": "loss", "content": 0.006457353942096233, "timestamp": "2025-09-15 03:20:16.772164", "step": 1320, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:16.801557", "step": 1320, "epoch": 2 }, { "type": "loss", "content": 0.04024200886487961, "timestamp": "2025-09-15 03:20:16.803679", "step": 1321, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:16.833915", "step": 1321, "epoch": 2 }, { "type": "loss", "content": 0.02188965305685997, "timestamp": "2025-09-15 03:20:16.836081", "step": 1322, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:16.866032", "step": 1322, "epoch": 2 }, { "type": "loss", "content": 0.010733796283602715, "timestamp": "2025-09-15 03:20:16.868518", "step": 1323, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:16.899945", "step": 1323, "epoch": 2 }, { "type": "loss", "content": 0.00566983362659812, "timestamp": "2025-09-15 03:20:16.924177", "step": 1324, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:16.954756", "step": 1324, "epoch": 2 }, { "type": "loss", "content": 0.00432900246232748, "timestamp": "2025-09-15 03:20:16.957317", "step": 1325, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:16.987368", "step": 1325, "epoch": 2 }, { "type": "loss", "content": 0.006719162221997976, "timestamp": "2025-09-15 03:20:16.989682", "step": 1326, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:17.020100", "step": 1326, "epoch": 2 }, { "type": "loss", "content": 0.005306191276758909, "timestamp": "2025-09-15 03:20:17.022133", "step": 1327, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:17.052089", "step": 1327, "epoch": 2 }, { "type": "loss", "content": 0.008937738835811615, "timestamp": "2025-09-15 03:20:17.075674", "step": 1328, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:17.106078", "step": 1328, "epoch": 2 }, { "type": "loss", "content": 0.0164463073015213, "timestamp": "2025-09-15 03:20:17.108150", "step": 1329, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:17.137462", "step": 1329, "epoch": 2 }, { "type": "loss", "content": 0.005517881829291582, "timestamp": "2025-09-15 03:20:17.139502", "step": 1330, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:17.169019", "step": 1330, "epoch": 2 }, { "type": "loss", "content": 0.041902415454387665, "timestamp": "2025-09-15 03:20:17.171133", "step": 1331, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:17.200749", "step": 1331, "epoch": 2 }, { "type": "loss", "content": 0.01374905463308096, "timestamp": "2025-09-15 03:20:17.224404", "step": 1332, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:17.254785", "step": 1332, "epoch": 2 }, { "type": "loss", "content": 0.00975362304598093, "timestamp": "2025-09-15 03:20:17.256854", "step": 1333, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:17.286711", "step": 1333, "epoch": 2 }, { "type": "loss", "content": 0.013099453411996365, "timestamp": "2025-09-15 03:20:17.288810", "step": 1334, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:17.318725", "step": 1334, "epoch": 2 }, { "type": "loss", "content": 0.021935611963272095, "timestamp": "2025-09-15 03:20:17.320810", "step": 1335, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:17.350834", "step": 1335, "epoch": 2 }, { "type": "loss", "content": 0.00626968452706933, "timestamp": "2025-09-15 03:20:17.374473", "step": 1336, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:17.406392", "step": 1336, "epoch": 2 }, { "type": "loss", "content": 0.009117616340517998, "timestamp": "2025-09-15 03:20:17.408597", "step": 1337, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:17.438098", "step": 1337, "epoch": 2 }, { "type": "loss", "content": 0.015381724573671818, "timestamp": "2025-09-15 03:20:17.440302", "step": 1338, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:17.469968", "step": 1338, "epoch": 2 }, { "type": "loss", "content": 0.003806751687079668, "timestamp": "2025-09-15 03:20:17.472070", "step": 1339, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:17.502164", "step": 1339, "epoch": 2 }, { "type": "loss", "content": 0.013369276188313961, "timestamp": "2025-09-15 03:20:17.525683", "step": 1340, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:17.556028", "step": 1340, "epoch": 2 }, { "type": "loss", "content": 0.020524680614471436, "timestamp": "2025-09-15 03:20:17.558096", "step": 1341, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:17.588263", "step": 1341, "epoch": 2 }, { "type": "loss", "content": 0.01145204808562994, "timestamp": "2025-09-15 03:20:17.590499", "step": 1342, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:17.619962", "step": 1342, "epoch": 2 }, { "type": "loss", "content": 0.010058576241135597, "timestamp": "2025-09-15 03:20:17.622051", "step": 1343, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:17.652033", "step": 1343, "epoch": 2 }, { "type": "loss", "content": 0.013884322717785835, "timestamp": "2025-09-15 03:20:17.675582", "step": 1344, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:17.705407", "step": 1344, "epoch": 2 }, { "type": "loss", "content": 0.0023034908808767796, "timestamp": "2025-09-15 03:20:17.707744", "step": 1345, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:17.737784", "step": 1345, "epoch": 2 }, { "type": "loss", "content": 0.014699487946927547, "timestamp": "2025-09-15 03:20:17.739803", "step": 1346, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:17.769943", "step": 1346, "epoch": 2 }, { "type": "loss", "content": 0.013409526087343693, "timestamp": "2025-09-15 03:20:17.772254", "step": 1347, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:17.805393", "step": 1347, "epoch": 2 }, { "type": "loss", "content": 0.00270974007435143, "timestamp": "2025-09-15 03:20:17.828877", "step": 1348, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:17.864618", "step": 1348, "epoch": 2 }, { "type": "loss", "content": 0.002560506807640195, "timestamp": "2025-09-15 03:20:17.866751", "step": 1349, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:17.896879", "step": 1349, "epoch": 2 }, { "type": "loss", "content": 0.0075851972214877605, "timestamp": "2025-09-15 03:20:17.899159", "step": 1350, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:20:17.928813", "step": 1350, "epoch": 2 }, { "type": "loss", "content": 0.008130542933940887, "timestamp": "2025-09-15 03:20:17.931417", "step": 1351, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:17.961170", "step": 1351, "epoch": 2 }, { "type": "loss", "content": 0.0021126719657331705, "timestamp": "2025-09-15 03:20:17.984810", "step": 1352, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:18.015225", "step": 1352, "epoch": 2 }, { "type": "loss", "content": 0.004403482656925917, "timestamp": "2025-09-15 03:20:18.017240", "step": 1353, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:18.047055", "step": 1353, "epoch": 2 }, { "type": "loss", "content": 0.018507298082113266, "timestamp": "2025-09-15 03:20:18.049010", "step": 1354, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:18.078714", "step": 1354, "epoch": 2 }, { "type": "loss", "content": 0.019040079787373543, "timestamp": "2025-09-15 03:20:18.080781", "step": 1355, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:18.110488", "step": 1355, "epoch": 2 }, { "type": "loss", "content": 0.027470026165246964, "timestamp": "2025-09-15 03:20:18.134088", "step": 1356, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:18.163629", "step": 1356, "epoch": 2 }, { "type": "loss", "content": 0.007900933735072613, "timestamp": "2025-09-15 03:20:18.165588", "step": 1357, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:18.195120", "step": 1357, "epoch": 2 }, { "type": "loss", "content": 0.007872597314417362, "timestamp": "2025-09-15 03:20:18.197346", "step": 1358, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:18.227225", "step": 1358, "epoch": 2 }, { "type": "loss", "content": 0.03232871741056442, "timestamp": "2025-09-15 03:20:18.229301", "step": 1359, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:18.258742", "step": 1359, "epoch": 2 }, { "type": "loss", "content": 0.0032297975849360228, "timestamp": "2025-09-15 03:20:18.282326", "step": 1360, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:18.312465", "step": 1360, "epoch": 2 }, { "type": "loss", "content": 0.03657596930861473, "timestamp": "2025-09-15 03:20:18.314889", "step": 1361, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:18.344739", "step": 1361, "epoch": 2 }, { "type": "loss", "content": 0.01493586041033268, "timestamp": "2025-09-15 03:20:18.347044", "step": 1362, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:18.376829", "step": 1362, "epoch": 2 }, { "type": "loss", "content": 0.005877330899238586, "timestamp": "2025-09-15 03:20:18.378935", "step": 1363, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:18.408795", "step": 1363, "epoch": 2 }, { "type": "loss", "content": 0.00430047744885087, "timestamp": "2025-09-15 03:20:18.432458", "step": 1364, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:18.479134", "step": 1364, "epoch": 2 }, { "type": "loss", "content": 0.015451534651219845, "timestamp": "2025-09-15 03:20:18.481520", "step": 1365, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:18.512230", "step": 1365, "epoch": 2 }, { "type": "loss", "content": 0.00832511205226183, "timestamp": "2025-09-15 03:20:18.514429", "step": 1366, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:18.546481", "step": 1366, "epoch": 2 }, { "type": "loss", "content": 0.012594794854521751, "timestamp": "2025-09-15 03:20:18.548572", "step": 1367, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:18.579123", "step": 1367, "epoch": 2 }, { "type": "loss", "content": 0.010724768042564392, "timestamp": "2025-09-15 03:20:18.602799", "step": 1368, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:20:19.313696", "step": 1368, "epoch": 2 }, { "type": "pplx", "content": 78061737.8925938, "timestamp": "2025-09-15 03:20:19.315841", "step": 1368, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:19.344404", "step": 1368, "epoch": 2 }, { "type": "loss", "content": 0.0019664892461150885, "timestamp": "2025-09-15 03:20:19.346500", "step": 1369, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:19.376283", "step": 1369, "epoch": 2 }, { "type": "loss", "content": 0.006835191044956446, "timestamp": "2025-09-15 03:20:19.378404", "step": 1370, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:20:19.408567", "step": 1370, "epoch": 2 }, { "type": "loss", "content": 0.007734772749245167, "timestamp": "2025-09-15 03:20:19.411233", "step": 1371, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:19.441162", "step": 1371, "epoch": 2 }, { "type": "loss", "content": 0.006751236040145159, "timestamp": "2025-09-15 03:20:19.464757", "step": 1372, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:19.494504", "step": 1372, "epoch": 2 }, { "type": "loss", "content": 0.013009068556129932, "timestamp": "2025-09-15 03:20:19.496913", "step": 1373, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:19.526972", "step": 1373, "epoch": 2 }, { "type": "loss", "content": 0.02262653224170208, "timestamp": "2025-09-15 03:20:19.529051", "step": 1374, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:19.559128", "step": 1374, "epoch": 2 }, { "type": "loss", "content": 0.02342197671532631, "timestamp": "2025-09-15 03:20:19.561402", "step": 1375, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:19.591457", "step": 1375, "epoch": 2 }, { "type": "loss", "content": 0.008266033604741096, "timestamp": "2025-09-15 03:20:19.615078", "step": 1376, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:19.645058", "step": 1376, "epoch": 2 }, { "type": "loss", "content": 0.0218669343739748, "timestamp": "2025-09-15 03:20:19.647391", "step": 1377, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:19.677539", "step": 1377, "epoch": 2 }, { "type": "loss", "content": 0.009586125612258911, "timestamp": "2025-09-15 03:20:19.679962", "step": 1378, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:19.710403", "step": 1378, "epoch": 2 }, { "type": "loss", "content": 0.01921851374208927, "timestamp": "2025-09-15 03:20:19.712736", "step": 1379, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:19.743105", "step": 1379, "epoch": 2 }, { "type": "loss", "content": 0.003876746166497469, "timestamp": "2025-09-15 03:20:19.766546", "step": 1380, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:19.796199", "step": 1380, "epoch": 2 }, { "type": "loss", "content": 0.02817346155643463, "timestamp": "2025-09-15 03:20:19.798283", "step": 1381, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:19.827830", "step": 1381, "epoch": 2 }, { "type": "loss", "content": 0.012659131549298763, "timestamp": "2025-09-15 03:20:19.829820", "step": 1382, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:19.859941", "step": 1382, "epoch": 2 }, { "type": "loss", "content": 0.011585528962314129, "timestamp": "2025-09-15 03:20:19.861996", "step": 1383, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:19.891722", "step": 1383, "epoch": 2 }, { "type": "loss", "content": 0.0018231567228212953, "timestamp": "2025-09-15 03:20:19.915232", "step": 1384, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:19.945415", "step": 1384, "epoch": 2 }, { "type": "loss", "content": 0.020531652495265007, "timestamp": "2025-09-15 03:20:19.947551", "step": 1385, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:19.976757", "step": 1385, "epoch": 2 }, { "type": "loss", "content": 0.016559377312660217, "timestamp": "2025-09-15 03:20:19.978808", "step": 1386, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:20.008324", "step": 1386, "epoch": 2 }, { "type": "loss", "content": 0.028371769934892654, "timestamp": "2025-09-15 03:20:20.011300", "step": 1387, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:20.040999", "step": 1387, "epoch": 2 }, { "type": "loss", "content": 0.0019516665488481522, "timestamp": "2025-09-15 03:20:20.064504", "step": 1388, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:20.094352", "step": 1388, "epoch": 2 }, { "type": "loss", "content": 0.01035268884152174, "timestamp": "2025-09-15 03:20:20.096426", "step": 1389, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:20.127438", "step": 1389, "epoch": 2 }, { "type": "loss", "content": 0.0055434927344322205, "timestamp": "2025-09-15 03:20:20.129590", "step": 1390, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:20.160074", "step": 1390, "epoch": 2 }, { "type": "loss", "content": 0.008157492615282536, "timestamp": "2025-09-15 03:20:20.162591", "step": 1391, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:20.192576", "step": 1391, "epoch": 2 }, { "type": "loss", "content": 0.006338280625641346, "timestamp": "2025-09-15 03:20:20.216372", "step": 1392, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:20.246324", "step": 1392, "epoch": 2 }, { "type": "loss", "content": 0.014807418920099735, "timestamp": "2025-09-15 03:20:20.248643", "step": 1393, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:20.278975", "step": 1393, "epoch": 2 }, { "type": "loss", "content": 0.014586917124688625, "timestamp": "2025-09-15 03:20:20.280978", "step": 1394, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:20.310495", "step": 1394, "epoch": 2 }, { "type": "loss", "content": 0.005534280091524124, "timestamp": "2025-09-15 03:20:20.312633", "step": 1395, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:20.343524", "step": 1395, "epoch": 2 }, { "type": "loss", "content": 0.008956083096563816, "timestamp": "2025-09-15 03:20:20.367164", "step": 1396, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:20.396889", "step": 1396, "epoch": 2 }, { "type": "loss", "content": 0.016665924340486526, "timestamp": "2025-09-15 03:20:20.399309", "step": 1397, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:20.430126", "step": 1397, "epoch": 2 }, { "type": "loss", "content": 0.03966280072927475, "timestamp": "2025-09-15 03:20:20.431935", "step": 1398, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:20.462178", "step": 1398, "epoch": 2 }, { "type": "loss", "content": 0.0075687081553041935, "timestamp": "2025-09-15 03:20:20.464648", "step": 1399, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:20.496864", "step": 1399, "epoch": 2 }, { "type": "loss", "content": 0.006370862480252981, "timestamp": "2025-09-15 03:20:20.521766", "step": 1400, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:20.553858", "step": 1400, "epoch": 2 }, { "type": "loss", "content": 0.005345536861568689, "timestamp": "2025-09-15 03:20:20.566804", "step": 1401, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:20.606600", "step": 1401, "epoch": 2 }, { "type": "loss", "content": 0.007125379052013159, "timestamp": "2025-09-15 03:20:20.608755", "step": 1402, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:20.646623", "step": 1402, "epoch": 2 }, { "type": "loss", "content": 0.007548754569143057, "timestamp": "2025-09-15 03:20:20.648686", "step": 1403, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:20.683741", "step": 1403, "epoch": 2 }, { "type": "loss", "content": 0.027229715138673782, "timestamp": "2025-09-15 03:20:20.707863", "step": 1404, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:20.742049", "step": 1404, "epoch": 2 }, { "type": "loss", "content": 0.014742210507392883, "timestamp": "2025-09-15 03:20:20.744183", "step": 1405, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:20.774779", "step": 1405, "epoch": 2 }, { "type": "loss", "content": 0.021080315113067627, "timestamp": "2025-09-15 03:20:20.777052", "step": 1406, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:20.807418", "step": 1406, "epoch": 2 }, { "type": "loss", "content": 0.0020817893091589212, "timestamp": "2025-09-15 03:20:20.810531", "step": 1407, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:20.840338", "step": 1407, "epoch": 2 }, { "type": "loss", "content": 0.017813878133893013, "timestamp": "2025-09-15 03:20:20.863928", "step": 1408, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:20.896500", "step": 1408, "epoch": 2 }, { "type": "loss", "content": 0.018892155960202217, "timestamp": "2025-09-15 03:20:20.898656", "step": 1409, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:20.929420", "step": 1409, "epoch": 2 }, { "type": "loss", "content": 0.0018157056765630841, "timestamp": "2025-09-15 03:20:20.931340", "step": 1410, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:20.960709", "step": 1410, "epoch": 2 }, { "type": "loss", "content": 0.01594122126698494, "timestamp": "2025-09-15 03:20:20.962774", "step": 1411, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:21.008829", "step": 1411, "epoch": 2 }, { "type": "loss", "content": 0.003443245543166995, "timestamp": "2025-09-15 03:20:21.032477", "step": 1412, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:21.062872", "step": 1412, "epoch": 2 }, { "type": "loss", "content": 0.0034602934028953314, "timestamp": "2025-09-15 03:20:21.065110", "step": 1413, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:21.095084", "step": 1413, "epoch": 2 }, { "type": "loss", "content": 0.0011383292730897665, "timestamp": "2025-09-15 03:20:21.097240", "step": 1414, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:21.131005", "step": 1414, "epoch": 2 }, { "type": "loss", "content": 0.015220055356621742, "timestamp": "2025-09-15 03:20:21.133169", "step": 1415, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:21.168698", "step": 1415, "epoch": 2 }, { "type": "loss", "content": 0.007584023289382458, "timestamp": "2025-09-15 03:20:21.194194", "step": 1416, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:21.223987", "step": 1416, "epoch": 2 }, { "type": "loss", "content": 0.001142369001172483, "timestamp": "2025-09-15 03:20:21.226997", "step": 1417, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:21.256503", "step": 1417, "epoch": 2 }, { "type": "loss", "content": 0.002171542961150408, "timestamp": "2025-09-15 03:20:21.258623", "step": 1418, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:21.289055", "step": 1418, "epoch": 2 }, { "type": "loss", "content": 0.003867406165227294, "timestamp": "2025-09-15 03:20:21.291049", "step": 1419, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:21.320831", "step": 1419, "epoch": 2 }, { "type": "loss", "content": 0.005698856431990862, "timestamp": "2025-09-15 03:20:21.344290", "step": 1420, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:21.374493", "step": 1420, "epoch": 2 }, { "type": "loss", "content": 0.00942178163677454, "timestamp": "2025-09-15 03:20:21.376639", "step": 1421, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:21.407065", "step": 1421, "epoch": 2 }, { "type": "loss", "content": 0.0010488334810361266, "timestamp": "2025-09-15 03:20:21.409201", "step": 1422, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:21.438697", "step": 1422, "epoch": 2 }, { "type": "loss", "content": 0.04225903004407883, "timestamp": "2025-09-15 03:20:21.440742", "step": 1423, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:21.470909", "step": 1423, "epoch": 2 }, { "type": "loss", "content": 0.023032980039715767, "timestamp": "2025-09-15 03:20:21.494406", "step": 1424, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:21.524524", "step": 1424, "epoch": 2 }, { "type": "loss", "content": 0.027132472023367882, "timestamp": "2025-09-15 03:20:21.526603", "step": 1425, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:20:22.242142", "step": 1425, "epoch": 2 }, { "type": "pplx", "content": 91832825.88411044, "timestamp": "2025-09-15 03:20:22.244081", "step": 1425, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:22.273362", "step": 1425, "epoch": 2 }, { "type": "loss", "content": 0.0015831494238227606, "timestamp": "2025-09-15 03:20:22.275378", "step": 1426, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:22.305335", "step": 1426, "epoch": 2 }, { "type": "loss", "content": 0.004346030298620462, "timestamp": "2025-09-15 03:20:22.307413", "step": 1427, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:22.337493", "step": 1427, "epoch": 2 }, { "type": "loss", "content": 0.03036068007349968, "timestamp": "2025-09-15 03:20:22.362426", "step": 1428, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:22.393852", "step": 1428, "epoch": 2 }, { "type": "loss", "content": 0.004273244645446539, "timestamp": "2025-09-15 03:20:22.395949", "step": 1429, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:22.426650", "step": 1429, "epoch": 2 }, { "type": "loss", "content": 0.005714810453355312, "timestamp": "2025-09-15 03:20:22.428749", "step": 1430, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:22.458910", "step": 1430, "epoch": 2 }, { "type": "loss", "content": 0.01159138698130846, "timestamp": "2025-09-15 03:20:22.460831", "step": 1431, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:22.492127", "step": 1431, "epoch": 2 }, { "type": "loss", "content": 0.007387206889688969, "timestamp": "2025-09-15 03:20:22.515783", "step": 1432, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:22.545814", "step": 1432, "epoch": 2 }, { "type": "loss", "content": 0.033014681190252304, "timestamp": "2025-09-15 03:20:22.548005", "step": 1433, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:22.577841", "step": 1433, "epoch": 2 }, { "type": "loss", "content": 0.015044102445244789, "timestamp": "2025-09-15 03:20:22.580007", "step": 1434, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:22.609580", "step": 1434, "epoch": 2 }, { "type": "loss", "content": 0.008882693946361542, "timestamp": "2025-09-15 03:20:22.611593", "step": 1435, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:22.641333", "step": 1435, "epoch": 2 }, { "type": "loss", "content": 0.003103381721302867, "timestamp": "2025-09-15 03:20:22.665837", "step": 1436, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:22.697401", "step": 1436, "epoch": 2 }, { "type": "loss", "content": 0.020440733060240746, "timestamp": "2025-09-15 03:20:22.699629", "step": 1437, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:22.731328", "step": 1437, "epoch": 2 }, { "type": "loss", "content": 0.007218391168862581, "timestamp": "2025-09-15 03:20:22.734611", "step": 1438, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:22.767966", "step": 1438, "epoch": 2 }, { "type": "loss", "content": 0.0050024488009512424, "timestamp": "2025-09-15 03:20:22.770042", "step": 1439, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:22.808501", "step": 1439, "epoch": 2 }, { "type": "loss", "content": 0.0015697681810706854, "timestamp": "2025-09-15 03:20:22.832531", "step": 1440, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:22.863319", "step": 1440, "epoch": 2 }, { "type": "loss", "content": 0.002475773449987173, "timestamp": "2025-09-15 03:20:22.865304", "step": 1441, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:22.895421", "step": 1441, "epoch": 2 }, { "type": "loss", "content": 0.002379846991971135, "timestamp": "2025-09-15 03:20:22.897495", "step": 1442, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:22.927810", "step": 1442, "epoch": 2 }, { "type": "loss", "content": 0.01347306091338396, "timestamp": "2025-09-15 03:20:22.932120", "step": 1443, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:22.964590", "step": 1443, "epoch": 2 }, { "type": "loss", "content": 0.0012131101684644818, "timestamp": "2025-09-15 03:20:22.987998", "step": 1444, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:23.017925", "step": 1444, "epoch": 2 }, { "type": "loss", "content": 0.0056666177697479725, "timestamp": "2025-09-15 03:20:23.020669", "step": 1445, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:23.051241", "step": 1445, "epoch": 2 }, { "type": "loss", "content": 0.006143883336335421, "timestamp": "2025-09-15 03:20:23.053318", "step": 1446, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:23.085681", "step": 1446, "epoch": 2 }, { "type": "loss", "content": 0.007649295497685671, "timestamp": "2025-09-15 03:20:23.088064", "step": 1447, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:23.118314", "step": 1447, "epoch": 2 }, { "type": "loss", "content": 0.007761591114103794, "timestamp": "2025-09-15 03:20:23.144982", "step": 1448, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:23.184242", "step": 1448, "epoch": 2 }, { "type": "loss", "content": 0.0191853828728199, "timestamp": "2025-09-15 03:20:23.186363", "step": 1449, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:23.216828", "step": 1449, "epoch": 2 }, { "type": "loss", "content": 0.0046918983571231365, "timestamp": "2025-09-15 03:20:23.221198", "step": 1450, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:23.253773", "step": 1450, "epoch": 2 }, { "type": "loss", "content": 0.013605318032205105, "timestamp": "2025-09-15 03:20:23.259672", "step": 1451, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:23.289779", "step": 1451, "epoch": 2 }, { "type": "loss", "content": 0.002763927448540926, "timestamp": "2025-09-15 03:20:23.313174", "step": 1452, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:23.342863", "step": 1452, "epoch": 2 }, { "type": "loss", "content": 0.0015863854205235839, "timestamp": "2025-09-15 03:20:23.344942", "step": 1453, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:23.374821", "step": 1453, "epoch": 2 }, { "type": "loss", "content": 0.013510088436305523, "timestamp": "2025-09-15 03:20:23.377065", "step": 1454, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:23.407701", "step": 1454, "epoch": 2 }, { "type": "loss", "content": 0.01751178875565529, "timestamp": "2025-09-15 03:20:23.409709", "step": 1455, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:23.439914", "step": 1455, "epoch": 2 }, { "type": "loss", "content": 0.019616883248090744, "timestamp": "2025-09-15 03:20:23.463336", "step": 1456, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:23.494310", "step": 1456, "epoch": 2 }, { "type": "loss", "content": 0.004575754515826702, "timestamp": "2025-09-15 03:20:23.496146", "step": 1457, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:23.525778", "step": 1457, "epoch": 2 }, { "type": "loss", "content": 0.019520703703165054, "timestamp": "2025-09-15 03:20:23.527844", "step": 1458, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:23.557502", "step": 1458, "epoch": 2 }, { "type": "loss", "content": 0.017982807010412216, "timestamp": "2025-09-15 03:20:23.559498", "step": 1459, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:23.589743", "step": 1459, "epoch": 2 }, { "type": "loss", "content": 0.0008443990955129266, "timestamp": "2025-09-15 03:20:23.613144", "step": 1460, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:23.643377", "step": 1460, "epoch": 2 }, { "type": "loss", "content": 0.05071615055203438, "timestamp": "2025-09-15 03:20:23.645733", "step": 1461, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:23.676146", "step": 1461, "epoch": 2 }, { "type": "loss", "content": 0.011467128060758114, "timestamp": "2025-09-15 03:20:23.678560", "step": 1462, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:23.709362", "step": 1462, "epoch": 2 }, { "type": "loss", "content": 0.022799739614129066, "timestamp": "2025-09-15 03:20:23.711584", "step": 1463, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:23.740840", "step": 1463, "epoch": 2 }, { "type": "loss", "content": 0.025861812755465508, "timestamp": "2025-09-15 03:20:23.764363", "step": 1464, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:23.794537", "step": 1464, "epoch": 2 }, { "type": "loss", "content": 0.008623288944363594, "timestamp": "2025-09-15 03:20:23.796559", "step": 1465, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:23.826650", "step": 1465, "epoch": 2 }, { "type": "loss", "content": 0.02509269118309021, "timestamp": "2025-09-15 03:20:23.830355", "step": 1466, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:23.861670", "step": 1466, "epoch": 2 }, { "type": "loss", "content": 0.03199907764792442, "timestamp": "2025-09-15 03:20:23.863635", "step": 1467, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:23.893808", "step": 1467, "epoch": 2 }, { "type": "loss", "content": 0.01597598008811474, "timestamp": "2025-09-15 03:20:23.917308", "step": 1468, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:23.948214", "step": 1468, "epoch": 2 }, { "type": "loss", "content": 0.013961074873805046, "timestamp": "2025-09-15 03:20:23.950123", "step": 1469, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:23.980236", "step": 1469, "epoch": 2 }, { "type": "loss", "content": 0.042097508907318115, "timestamp": "2025-09-15 03:20:23.982335", "step": 1470, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:24.012732", "step": 1470, "epoch": 2 }, { "type": "loss", "content": 0.018110528588294983, "timestamp": "2025-09-15 03:20:24.014959", "step": 1471, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:24.044851", "step": 1471, "epoch": 2 }, { "type": "loss", "content": 0.00701131671667099, "timestamp": "2025-09-15 03:20:24.068196", "step": 1472, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:24.098245", "step": 1472, "epoch": 2 }, { "type": "loss", "content": 0.006260588299483061, "timestamp": "2025-09-15 03:20:24.099899", "step": 1473, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:24.130081", "step": 1473, "epoch": 2 }, { "type": "loss", "content": 0.009581341408193111, "timestamp": "2025-09-15 03:20:24.132175", "step": 1474, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:24.162336", "step": 1474, "epoch": 2 }, { "type": "loss", "content": 0.013548241928219795, "timestamp": "2025-09-15 03:20:24.164334", "step": 1475, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:24.194673", "step": 1475, "epoch": 2 }, { "type": "loss", "content": 0.01859314739704132, "timestamp": "2025-09-15 03:20:24.218311", "step": 1476, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:24.248929", "step": 1476, "epoch": 2 }, { "type": "loss", "content": 0.002538888482376933, "timestamp": "2025-09-15 03:20:24.251174", "step": 1477, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:24.280996", "step": 1477, "epoch": 2 }, { "type": "loss", "content": 0.012346216477453709, "timestamp": "2025-09-15 03:20:24.283084", "step": 1478, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:24.313323", "step": 1478, "epoch": 2 }, { "type": "loss", "content": 0.013880938291549683, "timestamp": "2025-09-15 03:20:24.315322", "step": 1479, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:24.345084", "step": 1479, "epoch": 2 }, { "type": "loss", "content": 0.00909118540585041, "timestamp": "2025-09-15 03:20:24.368558", "step": 1480, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:24.398625", "step": 1480, "epoch": 2 }, { "type": "loss", "content": 0.015353651717305183, "timestamp": "2025-09-15 03:20:24.400759", "step": 1481, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:24.431330", "step": 1481, "epoch": 2 }, { "type": "loss", "content": 0.01462968997657299, "timestamp": "2025-09-15 03:20:24.433473", "step": 1482, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:20:25.140066", "step": 1482, "epoch": 2 }, { "type": "pplx", "content": 94876692.11559597, "timestamp": "2025-09-15 03:20:25.142058", "step": 1482, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:25.171504", "step": 1482, "epoch": 2 }, { "type": "loss", "content": 0.024286191910505295, "timestamp": "2025-09-15 03:20:25.173551", "step": 1483, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:25.204743", "step": 1483, "epoch": 2 }, { "type": "loss", "content": 0.0050559681840240955, "timestamp": "2025-09-15 03:20:25.228242", "step": 1484, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:20:25.258220", "step": 1484, "epoch": 2 }, { "type": "loss", "content": 0.02460642158985138, "timestamp": "2025-09-15 03:20:25.260141", "step": 1485, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:25.290321", "step": 1485, "epoch": 2 }, { "type": "loss", "content": 0.013014466501772404, "timestamp": "2025-09-15 03:20:25.292458", "step": 1486, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:25.321826", "step": 1486, "epoch": 2 }, { "type": "loss", "content": 0.004653667565435171, "timestamp": "2025-09-15 03:20:25.323964", "step": 1487, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:25.355568", "step": 1487, "epoch": 2 }, { "type": "loss", "content": 0.006579217966645956, "timestamp": "2025-09-15 03:20:25.379293", "step": 1488, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:25.409125", "step": 1488, "epoch": 2 }, { "type": "loss", "content": 0.01029092725366354, "timestamp": "2025-09-15 03:20:25.411476", "step": 1489, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:25.443252", "step": 1489, "epoch": 2 }, { "type": "loss", "content": 0.017754560336470604, "timestamp": "2025-09-15 03:20:25.445379", "step": 1490, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:25.475341", "step": 1490, "epoch": 2 }, { "type": "loss", "content": 0.014583499170839787, "timestamp": "2025-09-15 03:20:25.477270", "step": 1491, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:25.507864", "step": 1491, "epoch": 2 }, { "type": "loss", "content": 0.010781803168356419, "timestamp": "2025-09-15 03:20:25.531483", "step": 1492, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:25.561278", "step": 1492, "epoch": 2 }, { "type": "loss", "content": 0.006159218959510326, "timestamp": "2025-09-15 03:20:25.563663", "step": 1493, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:25.595076", "step": 1493, "epoch": 2 }, { "type": "loss", "content": 0.013684733770787716, "timestamp": "2025-09-15 03:20:25.597079", "step": 1494, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:20:25.626828", "step": 1494, "epoch": 2 }, { "type": "loss", "content": 0.041964296251535416, "timestamp": "2025-09-15 03:20:25.629397", "step": 1495, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:25.659260", "step": 1495, "epoch": 2 }, { "type": "loss", "content": 0.0167537909001112, "timestamp": "2025-09-15 03:20:25.682683", "step": 1496, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:25.712965", "step": 1496, "epoch": 2 }, { "type": "loss", "content": 0.005579576827585697, "timestamp": "2025-09-15 03:20:25.715181", "step": 1497, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:25.745062", "step": 1497, "epoch": 2 }, { "type": "loss", "content": 0.007810843177139759, "timestamp": "2025-09-15 03:20:25.747280", "step": 1498, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:25.777415", "step": 1498, "epoch": 2 }, { "type": "loss", "content": 0.011461307294666767, "timestamp": "2025-09-15 03:20:25.779512", "step": 1499, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:25.809270", "step": 1499, "epoch": 2 }, { "type": "loss", "content": 0.019440533593297005, "timestamp": "2025-09-15 03:20:25.833162", "step": 1500, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1500", "timestamp": "2025-09-15 03:20:32.277364", "step": 1500, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:32.321382", "step": 1500, "epoch": 2 }, { "type": "loss", "content": 0.00861346535384655, "timestamp": "2025-09-15 03:20:32.323847", "step": 1501, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:32.355255", "step": 1501, "epoch": 2 }, { "type": "loss", "content": 0.00922238826751709, "timestamp": "2025-09-15 03:20:32.357272", "step": 1502, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:32.387424", "step": 1502, "epoch": 2 }, { "type": "loss", "content": 0.004329313989728689, "timestamp": "2025-09-15 03:20:32.389466", "step": 1503, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:32.420808", "step": 1503, "epoch": 2 }, { "type": "loss", "content": 0.008352418430149555, "timestamp": "2025-09-15 03:20:32.444039", "step": 1504, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:32.474059", "step": 1504, "epoch": 2 }, { "type": "loss", "content": 0.02553441934287548, "timestamp": "2025-09-15 03:20:32.476264", "step": 1505, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:32.506486", "step": 1505, "epoch": 2 }, { "type": "loss", "content": 0.027252767235040665, "timestamp": "2025-09-15 03:20:32.508477", "step": 1506, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:32.538637", "step": 1506, "epoch": 2 }, { "type": "loss", "content": 0.030122382566332817, "timestamp": "2025-09-15 03:20:32.540479", "step": 1507, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:20:32.571620", "step": 1507, "epoch": 2 }, { "type": "loss", "content": 0.017167910933494568, "timestamp": "2025-09-15 03:20:32.596530", "step": 1508, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:32.626498", "step": 1508, "epoch": 2 }, { "type": "loss", "content": 0.008571980521082878, "timestamp": "2025-09-15 03:20:32.628596", "step": 1509, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:32.658742", "step": 1509, "epoch": 2 }, { "type": "loss", "content": 0.018993262201547623, "timestamp": "2025-09-15 03:20:32.660773", "step": 1510, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:32.691130", "step": 1510, "epoch": 2 }, { "type": "loss", "content": 0.0016351805534213781, "timestamp": "2025-09-15 03:20:32.693101", "step": 1511, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:32.723715", "step": 1511, "epoch": 2 }, { "type": "loss", "content": 0.013522188179194927, "timestamp": "2025-09-15 03:20:32.747448", "step": 1512, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:32.777603", "step": 1512, "epoch": 2 }, { "type": "loss", "content": 0.01635715551674366, "timestamp": "2025-09-15 03:20:32.779678", "step": 1513, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:32.810168", "step": 1513, "epoch": 2 }, { "type": "loss", "content": 0.018260132521390915, "timestamp": "2025-09-15 03:20:32.812231", "step": 1514, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:32.842217", "step": 1514, "epoch": 2 }, { "type": "loss", "content": 0.008778615854680538, "timestamp": "2025-09-15 03:20:32.844246", "step": 1515, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:32.874101", "step": 1515, "epoch": 2 }, { "type": "loss", "content": 0.04932339861989021, "timestamp": "2025-09-15 03:20:32.897674", "step": 1516, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:32.927801", "step": 1516, "epoch": 2 }, { "type": "loss", "content": 0.00995201151818037, "timestamp": "2025-09-15 03:20:32.929887", "step": 1517, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:32.960313", "step": 1517, "epoch": 2 }, { "type": "loss", "content": 0.02796383574604988, "timestamp": "2025-09-15 03:20:32.962499", "step": 1518, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:32.992497", "step": 1518, "epoch": 2 }, { "type": "loss", "content": 0.015218219719827175, "timestamp": "2025-09-15 03:20:32.999792", "step": 1519, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:33.032125", "step": 1519, "epoch": 2 }, { "type": "loss", "content": 0.0046564312651753426, "timestamp": "2025-09-15 03:20:33.055587", "step": 1520, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:33.085373", "step": 1520, "epoch": 2 }, { "type": "loss", "content": 0.015503061935305595, "timestamp": "2025-09-15 03:20:33.087396", "step": 1521, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:33.118251", "step": 1521, "epoch": 2 }, { "type": "loss", "content": 0.007008845917880535, "timestamp": "2025-09-15 03:20:33.120476", "step": 1522, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:33.150485", "step": 1522, "epoch": 2 }, { "type": "loss", "content": 0.01042736042290926, "timestamp": "2025-09-15 03:20:33.152513", "step": 1523, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:33.182214", "step": 1523, "epoch": 2 }, { "type": "loss", "content": 0.01703822799026966, "timestamp": "2025-09-15 03:20:33.205770", "step": 1524, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:33.235781", "step": 1524, "epoch": 2 }, { "type": "loss", "content": 0.015462229959666729, "timestamp": "2025-09-15 03:20:33.237778", "step": 1525, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:33.269916", "step": 1525, "epoch": 2 }, { "type": "loss", "content": 0.012268836610019207, "timestamp": "2025-09-15 03:20:33.271964", "step": 1526, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:33.302214", "step": 1526, "epoch": 2 }, { "type": "loss", "content": 0.012304599396884441, "timestamp": "2025-09-15 03:20:33.304579", "step": 1527, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:33.334831", "step": 1527, "epoch": 2 }, { "type": "loss", "content": 0.03490256145596504, "timestamp": "2025-09-15 03:20:33.358375", "step": 1528, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:33.389082", "step": 1528, "epoch": 2 }, { "type": "loss", "content": 0.028470784425735474, "timestamp": "2025-09-15 03:20:33.391015", "step": 1529, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:33.421119", "step": 1529, "epoch": 2 }, { "type": "loss", "content": 0.02890247479081154, "timestamp": "2025-09-15 03:20:33.423079", "step": 1530, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:33.453882", "step": 1530, "epoch": 2 }, { "type": "loss", "content": 0.007559936959296465, "timestamp": "2025-09-15 03:20:33.456034", "step": 1531, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:33.485878", "step": 1531, "epoch": 2 }, { "type": "loss", "content": 0.02463674731552601, "timestamp": "2025-09-15 03:20:33.509906", "step": 1532, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:33.539500", "step": 1532, "epoch": 2 }, { "type": "loss", "content": 0.016762617975473404, "timestamp": "2025-09-15 03:20:33.541563", "step": 1533, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:33.572423", "step": 1533, "epoch": 2 }, { "type": "loss", "content": 0.028797050938010216, "timestamp": "2025-09-15 03:20:33.574442", "step": 1534, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:33.604887", "step": 1534, "epoch": 2 }, { "type": "loss", "content": 0.02649320848286152, "timestamp": "2025-09-15 03:20:33.607124", "step": 1535, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:33.637901", "step": 1535, "epoch": 2 }, { "type": "loss", "content": 0.014305144548416138, "timestamp": "2025-09-15 03:20:33.661244", "step": 1536, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:33.691317", "step": 1536, "epoch": 2 }, { "type": "loss", "content": 0.007026375271379948, "timestamp": "2025-09-15 03:20:33.693222", "step": 1537, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:33.723287", "step": 1537, "epoch": 2 }, { "type": "loss", "content": 0.03457867354154587, "timestamp": "2025-09-15 03:20:33.725545", "step": 1538, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:33.755631", "step": 1538, "epoch": 2 }, { "type": "loss", "content": 0.004672153387218714, "timestamp": "2025-09-15 03:20:33.757686", "step": 1539, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:20:34.478807", "step": 1539, "epoch": 2 }, { "type": "pplx", "content": 87727919.03047407, "timestamp": "2025-09-15 03:20:34.480626", "step": 1539, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:34.509237", "step": 1539, "epoch": 2 }, { "type": "loss", "content": 0.014389991760253906, "timestamp": "2025-09-15 03:20:34.532819", "step": 1540, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:34.562765", "step": 1540, "epoch": 2 }, { "type": "loss", "content": 0.03513328358530998, "timestamp": "2025-09-15 03:20:34.564976", "step": 1541, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:34.594929", "step": 1541, "epoch": 2 }, { "type": "loss", "content": 0.014334970153868198, "timestamp": "2025-09-15 03:20:34.596912", "step": 1542, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:34.627022", "step": 1542, "epoch": 2 }, { "type": "loss", "content": 0.0023369737900793552, "timestamp": "2025-09-15 03:20:34.629179", "step": 1543, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:34.661102", "step": 1543, "epoch": 2 }, { "type": "loss", "content": 0.0029845726676285267, "timestamp": "2025-09-15 03:20:34.684666", "step": 1544, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:34.714562", "step": 1544, "epoch": 2 }, { "type": "loss", "content": 0.006009969394654036, "timestamp": "2025-09-15 03:20:34.716749", "step": 1545, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:34.746941", "step": 1545, "epoch": 2 }, { "type": "loss", "content": 0.045693524181842804, "timestamp": "2025-09-15 03:20:34.748985", "step": 1546, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:34.777963", "step": 1546, "epoch": 2 }, { "type": "loss", "content": 0.04461913928389549, "timestamp": "2025-09-15 03:20:34.779988", "step": 1547, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:34.809877", "step": 1547, "epoch": 2 }, { "type": "loss", "content": 0.005639377050101757, "timestamp": "2025-09-15 03:20:34.833176", "step": 1548, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:34.863635", "step": 1548, "epoch": 2 }, { "type": "loss", "content": 0.0031557695474475622, "timestamp": "2025-09-15 03:20:34.865557", "step": 1549, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:34.895729", "step": 1549, "epoch": 2 }, { "type": "loss", "content": 0.0018140083411708474, "timestamp": "2025-09-15 03:20:34.898667", "step": 1550, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:34.928439", "step": 1550, "epoch": 2 }, { "type": "loss", "content": 0.009812816977500916, "timestamp": "2025-09-15 03:20:34.930575", "step": 1551, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:34.960557", "step": 1551, "epoch": 2 }, { "type": "loss", "content": 0.01942797377705574, "timestamp": "2025-09-15 03:20:34.984241", "step": 1552, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:35.014609", "step": 1552, "epoch": 2 }, { "type": "loss", "content": 0.007379674352705479, "timestamp": "2025-09-15 03:20:35.016594", "step": 1553, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:35.046691", "step": 1553, "epoch": 2 }, { "type": "loss", "content": 0.0004087972338311374, "timestamp": "2025-09-15 03:20:35.048855", "step": 1554, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:35.078891", "step": 1554, "epoch": 2 }, { "type": "loss", "content": 0.038724932819604874, "timestamp": "2025-09-15 03:20:35.080968", "step": 1555, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:35.110396", "step": 1555, "epoch": 2 }, { "type": "loss", "content": 0.024620456621050835, "timestamp": "2025-09-15 03:20:35.133766", "step": 1556, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:35.164411", "step": 1556, "epoch": 2 }, { "type": "loss", "content": 0.0014810477150604129, "timestamp": "2025-09-15 03:20:35.166483", "step": 1557, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:35.196324", "step": 1557, "epoch": 2 }, { "type": "loss", "content": 0.005852595437318087, "timestamp": "2025-09-15 03:20:35.198654", "step": 1558, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:35.229262", "step": 1558, "epoch": 2 }, { "type": "loss", "content": 0.008139959536492825, "timestamp": "2025-09-15 03:20:35.231397", "step": 1559, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:35.261123", "step": 1559, "epoch": 2 }, { "type": "loss", "content": 0.002672811970114708, "timestamp": "2025-09-15 03:20:35.284481", "step": 1560, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:35.314509", "step": 1560, "epoch": 2 }, { "type": "loss", "content": 0.02692270092666149, "timestamp": "2025-09-15 03:20:35.316712", "step": 1561, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:35.346736", "step": 1561, "epoch": 2 }, { "type": "loss", "content": 0.004136955831199884, "timestamp": "2025-09-15 03:20:35.348698", "step": 1562, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:35.378193", "step": 1562, "epoch": 2 }, { "type": "loss", "content": 0.021433161571621895, "timestamp": "2025-09-15 03:20:35.380290", "step": 1563, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:35.409743", "step": 1563, "epoch": 2 }, { "type": "loss", "content": 0.00741331884637475, "timestamp": "2025-09-15 03:20:35.433291", "step": 1564, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:35.464055", "step": 1564, "epoch": 2 }, { "type": "loss", "content": 0.0038252919912338257, "timestamp": "2025-09-15 03:20:35.466176", "step": 1565, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:35.496187", "step": 1565, "epoch": 2 }, { "type": "loss", "content": 0.0054579381830990314, "timestamp": "2025-09-15 03:20:35.498182", "step": 1566, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:35.527562", "step": 1566, "epoch": 2 }, { "type": "loss", "content": 0.0015629673143848777, "timestamp": "2025-09-15 03:20:35.529903", "step": 1567, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:35.560546", "step": 1567, "epoch": 2 }, { "type": "loss", "content": 0.020011266693472862, "timestamp": "2025-09-15 03:20:35.584009", "step": 1568, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:35.613816", "step": 1568, "epoch": 2 }, { "type": "loss", "content": 0.005876597017049789, "timestamp": "2025-09-15 03:20:35.615820", "step": 1569, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:35.645785", "step": 1569, "epoch": 2 }, { "type": "loss", "content": 0.013301116414368153, "timestamp": "2025-09-15 03:20:35.648118", "step": 1570, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:35.677829", "step": 1570, "epoch": 2 }, { "type": "loss", "content": 0.015047280117869377, "timestamp": "2025-09-15 03:20:35.679844", "step": 1571, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:35.709852", "step": 1571, "epoch": 2 }, { "type": "loss", "content": 0.005345091689378023, "timestamp": "2025-09-15 03:20:35.733408", "step": 1572, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:35.764632", "step": 1572, "epoch": 2 }, { "type": "loss", "content": 0.003994218073785305, "timestamp": "2025-09-15 03:20:35.766847", "step": 1573, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:35.796720", "step": 1573, "epoch": 2 }, { "type": "loss", "content": 0.0005644602351821959, "timestamp": "2025-09-15 03:20:35.798752", "step": 1574, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:35.829468", "step": 1574, "epoch": 2 }, { "type": "loss", "content": 0.016421610489487648, "timestamp": "2025-09-15 03:20:35.831600", "step": 1575, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:35.862268", "step": 1575, "epoch": 2 }, { "type": "loss", "content": 0.0015365204308182001, "timestamp": "2025-09-15 03:20:35.885744", "step": 1576, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:35.917981", "step": 1576, "epoch": 2 }, { "type": "loss", "content": 0.007548271678388119, "timestamp": "2025-09-15 03:20:35.922096", "step": 1577, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:35.953143", "step": 1577, "epoch": 2 }, { "type": "loss", "content": 0.020572826266288757, "timestamp": "2025-09-15 03:20:35.955416", "step": 1578, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:35.985754", "step": 1578, "epoch": 2 }, { "type": "loss", "content": 0.0013158052461221814, "timestamp": "2025-09-15 03:20:35.987871", "step": 1579, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:36.018117", "step": 1579, "epoch": 2 }, { "type": "loss", "content": 0.010661915875971317, "timestamp": "2025-09-15 03:20:36.041489", "step": 1580, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:36.073728", "step": 1580, "epoch": 2 }, { "type": "loss", "content": 0.01249182689934969, "timestamp": "2025-09-15 03:20:36.075661", "step": 1581, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:36.105197", "step": 1581, "epoch": 2 }, { "type": "loss", "content": 0.0041304114274680614, "timestamp": "2025-09-15 03:20:36.107374", "step": 1582, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:36.137254", "step": 1582, "epoch": 2 }, { "type": "loss", "content": 0.008346246555447578, "timestamp": "2025-09-15 03:20:36.139348", "step": 1583, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:36.170728", "step": 1583, "epoch": 2 }, { "type": "loss", "content": 0.0073064700700342655, "timestamp": "2025-09-15 03:20:36.194207", "step": 1584, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:36.224480", "step": 1584, "epoch": 2 }, { "type": "loss", "content": 0.009679530747234821, "timestamp": "2025-09-15 03:20:36.226678", "step": 1585, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:36.256676", "step": 1585, "epoch": 2 }, { "type": "loss", "content": 0.023354899138212204, "timestamp": "2025-09-15 03:20:36.258595", "step": 1586, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:36.288494", "step": 1586, "epoch": 2 }, { "type": "loss", "content": 0.04176362231373787, "timestamp": "2025-09-15 03:20:36.290651", "step": 1587, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:36.320296", "step": 1587, "epoch": 2 }, { "type": "loss", "content": 0.0026484071277081966, "timestamp": "2025-09-15 03:20:36.343661", "step": 1588, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:36.374183", "step": 1588, "epoch": 2 }, { "type": "loss", "content": 0.01016245223581791, "timestamp": "2025-09-15 03:20:36.376232", "step": 1589, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:36.406830", "step": 1589, "epoch": 2 }, { "type": "loss", "content": 0.016892336308956146, "timestamp": "2025-09-15 03:20:36.408872", "step": 1590, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:36.438869", "step": 1590, "epoch": 2 }, { "type": "loss", "content": 0.03908044844865799, "timestamp": "2025-09-15 03:20:36.441365", "step": 1591, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:36.471395", "step": 1591, "epoch": 2 }, { "type": "loss", "content": 0.059734683483839035, "timestamp": "2025-09-15 03:20:36.494739", "step": 1592, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:36.524810", "step": 1592, "epoch": 2 }, { "type": "loss", "content": 0.008856425061821938, "timestamp": "2025-09-15 03:20:36.526888", "step": 1593, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:36.556640", "step": 1593, "epoch": 2 }, { "type": "loss", "content": 0.008398907259106636, "timestamp": "2025-09-15 03:20:36.558489", "step": 1594, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:36.589197", "step": 1594, "epoch": 2 }, { "type": "loss", "content": 0.015620146878063679, "timestamp": "2025-09-15 03:20:36.591171", "step": 1595, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:36.620925", "step": 1595, "epoch": 2 }, { "type": "loss", "content": 0.01876661740243435, "timestamp": "2025-09-15 03:20:36.644417", "step": 1596, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:20:37.356740", "step": 1596, "epoch": 2 }, { "type": "pplx", "content": 75713161.11239102, "timestamp": "2025-09-15 03:20:37.358839", "step": 1596, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:37.386785", "step": 1596, "epoch": 2 }, { "type": "loss", "content": 0.005393002647906542, "timestamp": "2025-09-15 03:20:37.388893", "step": 1597, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:37.418998", "step": 1597, "epoch": 2 }, { "type": "loss", "content": 0.00764166796579957, "timestamp": "2025-09-15 03:20:37.422029", "step": 1598, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:37.452195", "step": 1598, "epoch": 2 }, { "type": "loss", "content": 0.01738116517663002, "timestamp": "2025-09-15 03:20:37.456104", "step": 1599, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:37.487884", "step": 1599, "epoch": 2 }, { "type": "loss", "content": 0.010397247970104218, "timestamp": "2025-09-15 03:20:37.511360", "step": 1600, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:37.541090", "step": 1600, "epoch": 2 }, { "type": "loss", "content": 0.0032066632993519306, "timestamp": "2025-09-15 03:20:37.543093", "step": 1601, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:37.573301", "step": 1601, "epoch": 2 }, { "type": "loss", "content": 0.031752921640872955, "timestamp": "2025-09-15 03:20:37.575458", "step": 1602, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:37.605808", "step": 1602, "epoch": 2 }, { "type": "loss", "content": 0.018547099083662033, "timestamp": "2025-09-15 03:20:37.607971", "step": 1603, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:37.638177", "step": 1603, "epoch": 2 }, { "type": "loss", "content": 0.0020383193623274565, "timestamp": "2025-09-15 03:20:37.661681", "step": 1604, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:37.691809", "step": 1604, "epoch": 2 }, { "type": "loss", "content": 0.005079983733594418, "timestamp": "2025-09-15 03:20:37.693892", "step": 1605, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:37.723693", "step": 1605, "epoch": 2 }, { "type": "loss", "content": 0.03111599199473858, "timestamp": "2025-09-15 03:20:37.725664", "step": 1606, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:37.755590", "step": 1606, "epoch": 2 }, { "type": "loss", "content": 0.00759871955960989, "timestamp": "2025-09-15 03:20:37.757792", "step": 1607, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:37.788562", "step": 1607, "epoch": 2 }, { "type": "loss", "content": 0.00797184742987156, "timestamp": "2025-09-15 03:20:37.811982", "step": 1608, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:37.841919", "step": 1608, "epoch": 2 }, { "type": "loss", "content": 0.006208633538335562, "timestamp": "2025-09-15 03:20:37.843901", "step": 1609, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:37.874674", "step": 1609, "epoch": 2 }, { "type": "loss", "content": 0.026710880920290947, "timestamp": "2025-09-15 03:20:37.876830", "step": 1610, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:37.906920", "step": 1610, "epoch": 2 }, { "type": "loss", "content": 0.036108966916799545, "timestamp": "2025-09-15 03:20:37.909005", "step": 1611, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:37.939214", "step": 1611, "epoch": 2 }, { "type": "loss", "content": 0.0037150667048990726, "timestamp": "2025-09-15 03:20:37.962677", "step": 1612, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:37.993605", "step": 1612, "epoch": 2 }, { "type": "loss", "content": 0.013364890590310097, "timestamp": "2025-09-15 03:20:37.996141", "step": 1613, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:38.027672", "step": 1613, "epoch": 2 }, { "type": "loss", "content": 0.007008062209933996, "timestamp": "2025-09-15 03:20:38.029877", "step": 1614, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:38.059989", "step": 1614, "epoch": 2 }, { "type": "loss", "content": 0.0037146012764424086, "timestamp": "2025-09-15 03:20:38.062242", "step": 1615, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:38.092622", "step": 1615, "epoch": 2 }, { "type": "loss", "content": 0.01745392195880413, "timestamp": "2025-09-15 03:20:38.116174", "step": 1616, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:38.146420", "step": 1616, "epoch": 2 }, { "type": "loss", "content": 0.01895240880548954, "timestamp": "2025-09-15 03:20:38.148505", "step": 1617, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:38.179417", "step": 1617, "epoch": 2 }, { "type": "loss", "content": 0.0034704965073615313, "timestamp": "2025-09-15 03:20:38.181572", "step": 1618, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:38.211278", "step": 1618, "epoch": 2 }, { "type": "loss", "content": 0.009162304922938347, "timestamp": "2025-09-15 03:20:38.213647", "step": 1619, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:38.244105", "step": 1619, "epoch": 2 }, { "type": "loss", "content": 0.017583999782800674, "timestamp": "2025-09-15 03:20:38.267465", "step": 1620, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:38.297701", "step": 1620, "epoch": 2 }, { "type": "loss", "content": 0.007137224078178406, "timestamp": "2025-09-15 03:20:38.299808", "step": 1621, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:38.329946", "step": 1621, "epoch": 2 }, { "type": "loss", "content": 0.01079423725605011, "timestamp": "2025-09-15 03:20:38.332273", "step": 1622, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:38.362321", "step": 1622, "epoch": 2 }, { "type": "loss", "content": 0.001529446104541421, "timestamp": "2025-09-15 03:20:38.364410", "step": 1623, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:38.395059", "step": 1623, "epoch": 2 }, { "type": "loss", "content": 0.0021572206169366837, "timestamp": "2025-09-15 03:20:38.418473", "step": 1624, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:38.448592", "step": 1624, "epoch": 2 }, { "type": "loss", "content": 0.010212992317974567, "timestamp": "2025-09-15 03:20:38.450798", "step": 1625, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:38.480604", "step": 1625, "epoch": 2 }, { "type": "loss", "content": 0.00565464049577713, "timestamp": "2025-09-15 03:20:38.482563", "step": 1626, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:38.512439", "step": 1626, "epoch": 2 }, { "type": "loss", "content": 0.048854947090148926, "timestamp": "2025-09-15 03:20:38.514561", "step": 1627, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:20:38.544278", "step": 1627, "epoch": 2 }, { "type": "loss", "content": 0.008677697740495205, "timestamp": "2025-09-15 03:20:38.568169", "step": 1628, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:38.604878", "step": 1628, "epoch": 2 }, { "type": "loss", "content": 0.017185676842927933, "timestamp": "2025-09-15 03:20:38.606911", "step": 1629, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:38.642142", "step": 1629, "epoch": 2 }, { "type": "loss", "content": 0.00775103410705924, "timestamp": "2025-09-15 03:20:38.644605", "step": 1630, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:38.677289", "step": 1630, "epoch": 2 }, { "type": "loss", "content": 0.008807477541267872, "timestamp": "2025-09-15 03:20:38.679421", "step": 1631, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:38.709461", "step": 1631, "epoch": 2 }, { "type": "loss", "content": 0.029624175280332565, "timestamp": "2025-09-15 03:20:38.732945", "step": 1632, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:38.763295", "step": 1632, "epoch": 2 }, { "type": "loss", "content": 0.017720164731144905, "timestamp": "2025-09-15 03:20:38.765594", "step": 1633, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:38.795122", "step": 1633, "epoch": 2 }, { "type": "loss", "content": 0.006194696761667728, "timestamp": "2025-09-15 03:20:38.797302", "step": 1634, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:38.826968", "step": 1634, "epoch": 2 }, { "type": "loss", "content": 0.010376783087849617, "timestamp": "2025-09-15 03:20:38.829059", "step": 1635, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:38.859224", "step": 1635, "epoch": 2 }, { "type": "loss", "content": 0.008767379447817802, "timestamp": "2025-09-15 03:20:38.888963", "step": 1636, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:38.919455", "step": 1636, "epoch": 2 }, { "type": "loss", "content": 0.020663250237703323, "timestamp": "2025-09-15 03:20:38.921569", "step": 1637, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:38.951323", "step": 1637, "epoch": 2 }, { "type": "loss", "content": 0.0013582947431132197, "timestamp": "2025-09-15 03:20:38.953791", "step": 1638, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:38.984311", "step": 1638, "epoch": 2 }, { "type": "loss", "content": 0.002349320100620389, "timestamp": "2025-09-15 03:20:38.986506", "step": 1639, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:39.016196", "step": 1639, "epoch": 2 }, { "type": "loss", "content": 0.004257179331034422, "timestamp": "2025-09-15 03:20:39.039577", "step": 1640, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:39.069141", "step": 1640, "epoch": 2 }, { "type": "loss", "content": 0.02164802886545658, "timestamp": "2025-09-15 03:20:39.071310", "step": 1641, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:39.101676", "step": 1641, "epoch": 2 }, { "type": "loss", "content": 0.018588954582810402, "timestamp": "2025-09-15 03:20:39.104122", "step": 1642, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:39.134863", "step": 1642, "epoch": 2 }, { "type": "loss", "content": 0.007363787852227688, "timestamp": "2025-09-15 03:20:39.137090", "step": 1643, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:39.167079", "step": 1643, "epoch": 2 }, { "type": "loss", "content": 0.012514977715909481, "timestamp": "2025-09-15 03:20:39.190666", "step": 1644, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:39.220712", "step": 1644, "epoch": 2 }, { "type": "loss", "content": 0.008080360479652882, "timestamp": "2025-09-15 03:20:39.222765", "step": 1645, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:39.253110", "step": 1645, "epoch": 2 }, { "type": "loss", "content": 0.007632083725184202, "timestamp": "2025-09-15 03:20:39.255766", "step": 1646, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:39.285442", "step": 1646, "epoch": 2 }, { "type": "loss", "content": 0.007638550829142332, "timestamp": "2025-09-15 03:20:39.287989", "step": 1647, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:39.318621", "step": 1647, "epoch": 2 }, { "type": "loss", "content": 0.009446695446968079, "timestamp": "2025-09-15 03:20:39.342132", "step": 1648, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:39.372016", "step": 1648, "epoch": 2 }, { "type": "loss", "content": 0.018892496824264526, "timestamp": "2025-09-15 03:20:39.374197", "step": 1649, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:39.405886", "step": 1649, "epoch": 2 }, { "type": "loss", "content": 0.007213030010461807, "timestamp": "2025-09-15 03:20:39.408188", "step": 1650, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:39.438502", "step": 1650, "epoch": 2 }, { "type": "loss", "content": 0.0050399452447891235, "timestamp": "2025-09-15 03:20:39.440600", "step": 1651, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:39.470452", "step": 1651, "epoch": 2 }, { "type": "loss", "content": 0.0027350725140422583, "timestamp": "2025-09-15 03:20:39.493928", "step": 1652, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:39.523699", "step": 1652, "epoch": 2 }, { "type": "loss", "content": 0.017289655283093452, "timestamp": "2025-09-15 03:20:39.525776", "step": 1653, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:20:40.234928", "step": 1653, "epoch": 2 }, { "type": "pplx", "content": 78568448.51048347, "timestamp": "2025-09-15 03:20:40.237391", "step": 1653, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:40.265970", "step": 1653, "epoch": 2 }, { "type": "loss", "content": 0.010751119814813137, "timestamp": "2025-09-15 03:20:40.268607", "step": 1654, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:40.298419", "step": 1654, "epoch": 2 }, { "type": "loss", "content": 0.006664137355983257, "timestamp": "2025-09-15 03:20:40.300736", "step": 1655, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:40.330983", "step": 1655, "epoch": 2 }, { "type": "loss", "content": 0.005810289643704891, "timestamp": "2025-09-15 03:20:40.354617", "step": 1656, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:40.391034", "step": 1656, "epoch": 2 }, { "type": "loss", "content": 0.024757912382483482, "timestamp": "2025-09-15 03:20:40.393142", "step": 1657, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:40.424332", "step": 1657, "epoch": 2 }, { "type": "loss", "content": 0.024632303044199944, "timestamp": "2025-09-15 03:20:40.426236", "step": 1658, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:40.456340", "step": 1658, "epoch": 2 }, { "type": "loss", "content": 0.008729967288672924, "timestamp": "2025-09-15 03:20:40.458998", "step": 1659, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:40.488944", "step": 1659, "epoch": 2 }, { "type": "loss", "content": 0.006651734001934528, "timestamp": "2025-09-15 03:20:40.513017", "step": 1660, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:40.543613", "step": 1660, "epoch": 2 }, { "type": "loss", "content": 0.02994653768837452, "timestamp": "2025-09-15 03:20:40.545667", "step": 1661, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:40.575843", "step": 1661, "epoch": 2 }, { "type": "loss", "content": 0.009992452338337898, "timestamp": "2025-09-15 03:20:40.578324", "step": 1662, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:40.608450", "step": 1662, "epoch": 2 }, { "type": "loss", "content": 0.007453125435858965, "timestamp": "2025-09-15 03:20:40.610548", "step": 1663, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:40.640807", "step": 1663, "epoch": 2 }, { "type": "loss", "content": 0.003623406635597348, "timestamp": "2025-09-15 03:20:40.664292", "step": 1664, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:40.693917", "step": 1664, "epoch": 2 }, { "type": "loss", "content": 0.013959839940071106, "timestamp": "2025-09-15 03:20:40.695814", "step": 1665, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:40.726574", "step": 1665, "epoch": 2 }, { "type": "loss", "content": 0.008011700585484505, "timestamp": "2025-09-15 03:20:40.728630", "step": 1666, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:40.758656", "step": 1666, "epoch": 2 }, { "type": "loss", "content": 0.01187701802700758, "timestamp": "2025-09-15 03:20:40.760910", "step": 1667, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:40.791848", "step": 1667, "epoch": 2 }, { "type": "loss", "content": 0.015608700923621655, "timestamp": "2025-09-15 03:20:40.815628", "step": 1668, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:40.846081", "step": 1668, "epoch": 2 }, { "type": "loss", "content": 0.04297996684908867, "timestamp": "2025-09-15 03:20:40.848291", "step": 1669, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:40.878298", "step": 1669, "epoch": 2 }, { "type": "loss", "content": 0.005474329926073551, "timestamp": "2025-09-15 03:20:40.880300", "step": 1670, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:40.910196", "step": 1670, "epoch": 2 }, { "type": "loss", "content": 0.00804048590362072, "timestamp": "2025-09-15 03:20:40.912238", "step": 1671, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:40.941921", "step": 1671, "epoch": 2 }, { "type": "loss", "content": 0.006009248550981283, "timestamp": "2025-09-15 03:20:40.966692", "step": 1672, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:40.997034", "step": 1672, "epoch": 2 }, { "type": "loss", "content": 0.011756404303014278, "timestamp": "2025-09-15 03:20:41.001791", "step": 1673, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:41.034752", "step": 1673, "epoch": 2 }, { "type": "loss", "content": 0.0031918776221573353, "timestamp": "2025-09-15 03:20:41.037069", "step": 1674, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:41.067133", "step": 1674, "epoch": 2 }, { "type": "loss", "content": 0.0008713051793165505, "timestamp": "2025-09-15 03:20:41.069415", "step": 1675, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:41.099794", "step": 1675, "epoch": 2 }, { "type": "loss", "content": 0.003177103353664279, "timestamp": "2025-09-15 03:20:41.123206", "step": 1676, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:41.152822", "step": 1676, "epoch": 2 }, { "type": "loss", "content": 0.0052427686750888824, "timestamp": "2025-09-15 03:20:41.154997", "step": 1677, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:41.184416", "step": 1677, "epoch": 2 }, { "type": "loss", "content": 0.002482310403138399, "timestamp": "2025-09-15 03:20:41.186424", "step": 1678, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:20:41.216301", "step": 1678, "epoch": 2 }, { "type": "loss", "content": 0.008572629652917385, "timestamp": "2025-09-15 03:20:41.218792", "step": 1679, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:41.249469", "step": 1679, "epoch": 2 }, { "type": "loss", "content": 0.015897506847977638, "timestamp": "2025-09-15 03:20:41.273375", "step": 1680, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:41.303511", "step": 1680, "epoch": 2 }, { "type": "loss", "content": 0.005093819461762905, "timestamp": "2025-09-15 03:20:41.305513", "step": 1681, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:41.336699", "step": 1681, "epoch": 2 }, { "type": "loss", "content": 0.003218486439436674, "timestamp": "2025-09-15 03:20:41.338853", "step": 1682, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:41.369209", "step": 1682, "epoch": 2 }, { "type": "loss", "content": 0.00489377835765481, "timestamp": "2025-09-15 03:20:41.371412", "step": 1683, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:41.401557", "step": 1683, "epoch": 2 }, { "type": "loss", "content": 0.0022957702167332172, "timestamp": "2025-09-15 03:20:41.425070", "step": 1684, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:41.455113", "step": 1684, "epoch": 2 }, { "type": "loss", "content": 0.01362029928714037, "timestamp": "2025-09-15 03:20:41.457648", "step": 1685, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:41.487373", "step": 1685, "epoch": 2 }, { "type": "loss", "content": 0.006150937173515558, "timestamp": "2025-09-15 03:20:41.489644", "step": 1686, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:41.519530", "step": 1686, "epoch": 2 }, { "type": "loss", "content": 0.03606478497385979, "timestamp": "2025-09-15 03:20:41.521775", "step": 1687, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:41.551690", "step": 1687, "epoch": 2 }, { "type": "loss", "content": 0.047492969781160355, "timestamp": "2025-09-15 03:20:41.575323", "step": 1688, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:41.605419", "step": 1688, "epoch": 2 }, { "type": "loss", "content": 0.0005590234650298953, "timestamp": "2025-09-15 03:20:41.607620", "step": 1689, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:41.638101", "step": 1689, "epoch": 2 }, { "type": "loss", "content": 0.04679226502776146, "timestamp": "2025-09-15 03:20:41.640128", "step": 1690, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:41.670723", "step": 1690, "epoch": 2 }, { "type": "loss", "content": 0.0004176944785285741, "timestamp": "2025-09-15 03:20:41.672979", "step": 1691, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:41.702924", "step": 1691, "epoch": 2 }, { "type": "loss", "content": 0.024234410375356674, "timestamp": "2025-09-15 03:20:41.726381", "step": 1692, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:41.756427", "step": 1692, "epoch": 2 }, { "type": "loss", "content": 0.012740524485707283, "timestamp": "2025-09-15 03:20:41.758288", "step": 1693, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:41.788508", "step": 1693, "epoch": 2 }, { "type": "loss", "content": 0.003929396625608206, "timestamp": "2025-09-15 03:20:41.790747", "step": 1694, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:41.820472", "step": 1694, "epoch": 2 }, { "type": "loss", "content": 0.0006833495572209358, "timestamp": "2025-09-15 03:20:41.822335", "step": 1695, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:41.851921", "step": 1695, "epoch": 2 }, { "type": "loss", "content": 0.04751553386449814, "timestamp": "2025-09-15 03:20:41.875347", "step": 1696, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:41.905416", "step": 1696, "epoch": 2 }, { "type": "loss", "content": 0.0025537125766277313, "timestamp": "2025-09-15 03:20:41.907647", "step": 1697, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:41.937629", "step": 1697, "epoch": 2 }, { "type": "loss", "content": 0.009955295361578465, "timestamp": "2025-09-15 03:20:41.939876", "step": 1698, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:41.970006", "step": 1698, "epoch": 2 }, { "type": "loss", "content": 0.0014152992516756058, "timestamp": "2025-09-15 03:20:41.972056", "step": 1699, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:42.002196", "step": 1699, "epoch": 2 }, { "type": "loss", "content": 0.01002996414899826, "timestamp": "2025-09-15 03:20:42.025414", "step": 1700, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:42.055554", "step": 1700, "epoch": 2 }, { "type": "loss", "content": 0.022007791325449944, "timestamp": "2025-09-15 03:20:42.057517", "step": 1701, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:42.087247", "step": 1701, "epoch": 2 }, { "type": "loss", "content": 0.00791712012141943, "timestamp": "2025-09-15 03:20:42.090377", "step": 1702, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:42.120682", "step": 1702, "epoch": 2 }, { "type": "loss", "content": 0.0015882424777373672, "timestamp": "2025-09-15 03:20:42.122927", "step": 1703, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:42.153266", "step": 1703, "epoch": 2 }, { "type": "loss", "content": 0.0013538190396502614, "timestamp": "2025-09-15 03:20:42.176636", "step": 1704, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:42.208301", "step": 1704, "epoch": 2 }, { "type": "loss", "content": 0.004893905948847532, "timestamp": "2025-09-15 03:20:42.210359", "step": 1705, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:42.241820", "step": 1705, "epoch": 2 }, { "type": "loss", "content": 0.022312074899673462, "timestamp": "2025-09-15 03:20:42.244028", "step": 1706, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:42.274923", "step": 1706, "epoch": 2 }, { "type": "loss", "content": 0.001479136641137302, "timestamp": "2025-09-15 03:20:42.276865", "step": 1707, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:42.307119", "step": 1707, "epoch": 2 }, { "type": "loss", "content": 0.004247893113642931, "timestamp": "2025-09-15 03:20:42.330547", "step": 1708, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:42.360325", "step": 1708, "epoch": 2 }, { "type": "loss", "content": 0.01009181048721075, "timestamp": "2025-09-15 03:20:42.362291", "step": 1709, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:42.391910", "step": 1709, "epoch": 2 }, { "type": "loss", "content": 0.001586645608767867, "timestamp": "2025-09-15 03:20:42.395118", "step": 1710, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:20:43.117474", "step": 1710, "epoch": 2 }, { "type": "pplx", "content": 83493489.47252946, "timestamp": "2025-09-15 03:20:43.119598", "step": 1710, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:43.148336", "step": 1710, "epoch": 2 }, { "type": "loss", "content": 0.015090351924300194, "timestamp": "2025-09-15 03:20:43.150343", "step": 1711, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:43.180832", "step": 1711, "epoch": 2 }, { "type": "loss", "content": 0.01114923320710659, "timestamp": "2025-09-15 03:20:43.204356", "step": 1712, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:43.234968", "step": 1712, "epoch": 2 }, { "type": "loss", "content": 0.006485131569206715, "timestamp": "2025-09-15 03:20:43.237163", "step": 1713, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:43.266997", "step": 1713, "epoch": 2 }, { "type": "loss", "content": 0.005379594396799803, "timestamp": "2025-09-15 03:20:43.269041", "step": 1714, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:43.299839", "step": 1714, "epoch": 2 }, { "type": "loss", "content": 0.002022078027948737, "timestamp": "2025-09-15 03:20:43.301843", "step": 1715, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:43.332526", "step": 1715, "epoch": 2 }, { "type": "loss", "content": 0.004368626978248358, "timestamp": "2025-09-15 03:20:43.356377", "step": 1716, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:43.387317", "step": 1716, "epoch": 2 }, { "type": "loss", "content": 0.03480208292603493, "timestamp": "2025-09-15 03:20:43.389314", "step": 1717, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:43.420851", "step": 1717, "epoch": 2 }, { "type": "loss", "content": 0.008094354532659054, "timestamp": "2025-09-15 03:20:43.422898", "step": 1718, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:43.453099", "step": 1718, "epoch": 2 }, { "type": "loss", "content": 0.014233270660042763, "timestamp": "2025-09-15 03:20:43.455116", "step": 1719, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:43.485088", "step": 1719, "epoch": 2 }, { "type": "loss", "content": 0.019168147817254066, "timestamp": "2025-09-15 03:20:43.509096", "step": 1720, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:43.538772", "step": 1720, "epoch": 2 }, { "type": "loss", "content": 0.0029123888816684484, "timestamp": "2025-09-15 03:20:43.540775", "step": 1721, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:43.570988", "step": 1721, "epoch": 2 }, { "type": "loss", "content": 0.010065059177577496, "timestamp": "2025-09-15 03:20:43.573220", "step": 1722, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:43.602709", "step": 1722, "epoch": 2 }, { "type": "loss", "content": 0.005632904823869467, "timestamp": "2025-09-15 03:20:43.605318", "step": 1723, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:43.635486", "step": 1723, "epoch": 2 }, { "type": "loss", "content": 0.0026095120701938868, "timestamp": "2025-09-15 03:20:43.659098", "step": 1724, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:43.689238", "step": 1724, "epoch": 2 }, { "type": "loss", "content": 0.0018347349250689149, "timestamp": "2025-09-15 03:20:43.691287", "step": 1725, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:43.721825", "step": 1725, "epoch": 2 }, { "type": "loss", "content": 0.0057741072960197926, "timestamp": "2025-09-15 03:20:43.723926", "step": 1726, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:43.753833", "step": 1726, "epoch": 2 }, { "type": "loss", "content": 0.02014450542628765, "timestamp": "2025-09-15 03:20:43.756141", "step": 1727, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:43.787069", "step": 1727, "epoch": 2 }, { "type": "loss", "content": 0.003958654589951038, "timestamp": "2025-09-15 03:20:43.810292", "step": 1728, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:43.841359", "step": 1728, "epoch": 2 }, { "type": "loss", "content": 0.003618575632572174, "timestamp": "2025-09-15 03:20:43.843511", "step": 1729, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:43.873413", "step": 1729, "epoch": 2 }, { "type": "loss", "content": 0.0016862701158970594, "timestamp": "2025-09-15 03:20:43.875673", "step": 1730, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:43.905561", "step": 1730, "epoch": 2 }, { "type": "loss", "content": 0.0172065868973732, "timestamp": "2025-09-15 03:20:43.907718", "step": 1731, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:43.938296", "step": 1731, "epoch": 2 }, { "type": "loss", "content": 0.0003309242019895464, "timestamp": "2025-09-15 03:20:43.961829", "step": 1732, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:43.991898", "step": 1732, "epoch": 2 }, { "type": "loss", "content": 0.0021879069972783327, "timestamp": "2025-09-15 03:20:43.994071", "step": 1733, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:44.024146", "step": 1733, "epoch": 2 }, { "type": "loss", "content": 0.009583430364727974, "timestamp": "2025-09-15 03:20:44.026144", "step": 1734, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:44.056272", "step": 1734, "epoch": 2 }, { "type": "loss", "content": 0.006018994841724634, "timestamp": "2025-09-15 03:20:44.058080", "step": 1735, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:44.088023", "step": 1735, "epoch": 2 }, { "type": "loss", "content": 0.0028155508916825056, "timestamp": "2025-09-15 03:20:44.111442", "step": 1736, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:44.141841", "step": 1736, "epoch": 2 }, { "type": "loss", "content": 0.009163476526737213, "timestamp": "2025-09-15 03:20:44.143912", "step": 1737, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:44.175174", "step": 1737, "epoch": 2 }, { "type": "loss", "content": 0.009106485173106194, "timestamp": "2025-09-15 03:20:44.188879", "step": 1738, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:44.228187", "step": 1738, "epoch": 2 }, { "type": "loss", "content": 0.0013123898534104228, "timestamp": "2025-09-15 03:20:44.233528", "step": 1739, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:44.264347", "step": 1739, "epoch": 2 }, { "type": "loss", "content": 0.0030263494700193405, "timestamp": "2025-09-15 03:20:44.287704", "step": 1740, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:44.318072", "step": 1740, "epoch": 2 }, { "type": "loss", "content": 0.02149033546447754, "timestamp": "2025-09-15 03:20:44.320075", "step": 1741, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:44.353689", "step": 1741, "epoch": 2 }, { "type": "loss", "content": 0.0038601472042500973, "timestamp": "2025-09-15 03:20:44.356010", "step": 1742, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:44.386100", "step": 1742, "epoch": 2 }, { "type": "loss", "content": 0.0044944509863853455, "timestamp": "2025-09-15 03:20:44.388029", "step": 1743, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:44.417741", "step": 1743, "epoch": 2 }, { "type": "loss", "content": 0.011085770092904568, "timestamp": "2025-09-15 03:20:44.442293", "step": 1744, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:44.473336", "step": 1744, "epoch": 2 }, { "type": "loss", "content": 0.0032526575960218906, "timestamp": "2025-09-15 03:20:44.475200", "step": 1745, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:44.505177", "step": 1745, "epoch": 2 }, { "type": "loss", "content": 0.002188687212765217, "timestamp": "2025-09-15 03:20:44.509598", "step": 1746, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:44.539571", "step": 1746, "epoch": 2 }, { "type": "loss", "content": 0.0019085209351032972, "timestamp": "2025-09-15 03:20:44.541532", "step": 1747, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:44.571388", "step": 1747, "epoch": 2 }, { "type": "loss", "content": 0.0015842962311580777, "timestamp": "2025-09-15 03:20:44.594903", "step": 1748, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:44.625855", "step": 1748, "epoch": 2 }, { "type": "loss", "content": 0.0016496534226462245, "timestamp": "2025-09-15 03:20:44.631515", "step": 1749, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:44.662592", "step": 1749, "epoch": 2 }, { "type": "loss", "content": 0.0018758989172056317, "timestamp": "2025-09-15 03:20:44.664945", "step": 1750, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:44.695562", "step": 1750, "epoch": 2 }, { "type": "loss", "content": 0.019384147599339485, "timestamp": "2025-09-15 03:20:44.697856", "step": 1751, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:44.729048", "step": 1751, "epoch": 2 }, { "type": "loss", "content": 0.0029005541000515223, "timestamp": "2025-09-15 03:20:44.752789", "step": 1752, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:44.782658", "step": 1752, "epoch": 2 }, { "type": "loss", "content": 0.016111260280013084, "timestamp": "2025-09-15 03:20:44.788194", "step": 1753, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:44.823184", "step": 1753, "epoch": 2 }, { "type": "loss", "content": 0.0369202196598053, "timestamp": "2025-09-15 03:20:44.826050", "step": 1754, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:44.856300", "step": 1754, "epoch": 2 }, { "type": "loss", "content": 0.002441554795950651, "timestamp": "2025-09-15 03:20:44.858273", "step": 1755, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:44.888430", "step": 1755, "epoch": 2 }, { "type": "loss", "content": 0.005904212594032288, "timestamp": "2025-09-15 03:20:44.916851", "step": 1756, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:44.952393", "step": 1756, "epoch": 2 }, { "type": "loss", "content": 0.0009123924537561834, "timestamp": "2025-09-15 03:20:44.954588", "step": 1757, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:44.984359", "step": 1757, "epoch": 2 }, { "type": "loss", "content": 0.0018833059584721923, "timestamp": "2025-09-15 03:20:44.986753", "step": 1758, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:45.016376", "step": 1758, "epoch": 2 }, { "type": "loss", "content": 0.001379848108626902, "timestamp": "2025-09-15 03:20:45.018616", "step": 1759, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:45.048322", "step": 1759, "epoch": 2 }, { "type": "loss", "content": 0.025038572028279305, "timestamp": "2025-09-15 03:20:45.071809", "step": 1760, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:45.102091", "step": 1760, "epoch": 2 }, { "type": "loss", "content": 0.0022791826631873846, "timestamp": "2025-09-15 03:20:45.105200", "step": 1761, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:45.135687", "step": 1761, "epoch": 2 }, { "type": "loss", "content": 0.00048445953871123493, "timestamp": "2025-09-15 03:20:45.138112", "step": 1762, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:45.170663", "step": 1762, "epoch": 2 }, { "type": "loss", "content": 0.0004319077415857464, "timestamp": "2025-09-15 03:20:45.172913", "step": 1763, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:45.202911", "step": 1763, "epoch": 2 }, { "type": "loss", "content": 0.007473528850823641, "timestamp": "2025-09-15 03:20:45.226569", "step": 1764, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:45.257991", "step": 1764, "epoch": 2 }, { "type": "loss", "content": 0.005291355308145285, "timestamp": "2025-09-15 03:20:45.260130", "step": 1765, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:45.289901", "step": 1765, "epoch": 2 }, { "type": "loss", "content": 0.0004802733601536602, "timestamp": "2025-09-15 03:20:45.292068", "step": 1766, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:45.322206", "step": 1766, "epoch": 2 }, { "type": "loss", "content": 0.00027232806314714253, "timestamp": "2025-09-15 03:20:45.324542", "step": 1767, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:20:46.042368", "step": 1767, "epoch": 2 }, { "type": "pplx", "content": 84625213.86497217, "timestamp": "2025-09-15 03:20:46.044474", "step": 1767, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:46.072755", "step": 1767, "epoch": 2 }, { "type": "loss", "content": 0.00040455401176586747, "timestamp": "2025-09-15 03:20:46.096280", "step": 1768, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:46.126083", "step": 1768, "epoch": 2 }, { "type": "loss", "content": 0.013869697228074074, "timestamp": "2025-09-15 03:20:46.127945", "step": 1769, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:46.158184", "step": 1769, "epoch": 2 }, { "type": "loss", "content": 0.0016579412622377276, "timestamp": "2025-09-15 03:20:46.160233", "step": 1770, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:46.191522", "step": 1770, "epoch": 2 }, { "type": "loss", "content": 0.0013221147237345576, "timestamp": "2025-09-15 03:20:46.193724", "step": 1771, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:46.223768", "step": 1771, "epoch": 2 }, { "type": "loss", "content": 0.00014326020027510822, "timestamp": "2025-09-15 03:20:46.247221", "step": 1772, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:46.277266", "step": 1772, "epoch": 2 }, { "type": "loss", "content": 0.0008943701977841556, "timestamp": "2025-09-15 03:20:46.279598", "step": 1773, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:46.310031", "step": 1773, "epoch": 2 }, { "type": "loss", "content": 0.0063716270960867405, "timestamp": "2025-09-15 03:20:46.312096", "step": 1774, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:46.341770", "step": 1774, "epoch": 2 }, { "type": "loss", "content": 0.0200633704662323, "timestamp": "2025-09-15 03:20:46.344779", "step": 1775, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:46.374980", "step": 1775, "epoch": 2 }, { "type": "loss", "content": 0.0015325994463637471, "timestamp": "2025-09-15 03:20:46.398474", "step": 1776, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:46.430797", "step": 1776, "epoch": 2 }, { "type": "loss", "content": 0.0016208905726671219, "timestamp": "2025-09-15 03:20:46.432859", "step": 1777, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:46.462788", "step": 1777, "epoch": 2 }, { "type": "loss", "content": 0.0006189151317812502, "timestamp": "2025-09-15 03:20:46.465097", "step": 1778, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:46.495495", "step": 1778, "epoch": 2 }, { "type": "loss", "content": 0.0018870577914640307, "timestamp": "2025-09-15 03:20:46.497784", "step": 1779, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:46.527883", "step": 1779, "epoch": 2 }, { "type": "loss", "content": 0.0016338779823854566, "timestamp": "2025-09-15 03:20:46.551495", "step": 1780, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:46.581357", "step": 1780, "epoch": 2 }, { "type": "loss", "content": 0.0007233429932966828, "timestamp": "2025-09-15 03:20:46.583323", "step": 1781, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:46.613213", "step": 1781, "epoch": 2 }, { "type": "loss", "content": 0.0027278957422822714, "timestamp": "2025-09-15 03:20:46.615137", "step": 1782, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:46.645534", "step": 1782, "epoch": 2 }, { "type": "loss", "content": 0.0006363813881762326, "timestamp": "2025-09-15 03:20:46.649028", "step": 1783, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:46.679027", "step": 1783, "epoch": 2 }, { "type": "loss", "content": 0.0005047211307100952, "timestamp": "2025-09-15 03:20:46.702401", "step": 1784, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:46.732556", "step": 1784, "epoch": 2 }, { "type": "loss", "content": 0.0002988137421198189, "timestamp": "2025-09-15 03:20:46.734597", "step": 1785, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:46.764920", "step": 1785, "epoch": 2 }, { "type": "loss", "content": 0.0006779587711207569, "timestamp": "2025-09-15 03:20:46.767073", "step": 1786, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:46.796271", "step": 1786, "epoch": 2 }, { "type": "loss", "content": 0.016672402620315552, "timestamp": "2025-09-15 03:20:46.798085", "step": 1787, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:46.827670", "step": 1787, "epoch": 2 }, { "type": "loss", "content": 0.0010021728230640292, "timestamp": "2025-09-15 03:20:46.851165", "step": 1788, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:46.885174", "step": 1788, "epoch": 2 }, { "type": "loss", "content": 0.0002892357297241688, "timestamp": "2025-09-15 03:20:46.887130", "step": 1789, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:46.918801", "step": 1789, "epoch": 2 }, { "type": "loss", "content": 0.0004077693447470665, "timestamp": "2025-09-15 03:20:46.921088", "step": 1790, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:46.951079", "step": 1790, "epoch": 2 }, { "type": "loss", "content": 0.002524502808228135, "timestamp": "2025-09-15 03:20:46.957439", "step": 1791, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:46.987001", "step": 1791, "epoch": 2 }, { "type": "loss", "content": 0.003516556229442358, "timestamp": "2025-09-15 03:20:47.010613", "step": 1792, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:47.042252", "step": 1792, "epoch": 2 }, { "type": "loss", "content": 0.003348279744386673, "timestamp": "2025-09-15 03:20:47.052410", "step": 1793, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:47.085305", "step": 1793, "epoch": 2 }, { "type": "loss", "content": 0.0006031015072949231, "timestamp": "2025-09-15 03:20:47.091458", "step": 1794, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:47.123623", "step": 1794, "epoch": 2 }, { "type": "loss", "content": 0.000745028315577656, "timestamp": "2025-09-15 03:20:47.125805", "step": 1795, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:47.155477", "step": 1795, "epoch": 2 }, { "type": "loss", "content": 0.001737898332066834, "timestamp": "2025-09-15 03:20:47.180130", "step": 1796, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:47.211496", "step": 1796, "epoch": 2 }, { "type": "loss", "content": 0.023937616497278214, "timestamp": "2025-09-15 03:20:47.213548", "step": 1797, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:47.250347", "step": 1797, "epoch": 2 }, { "type": "loss", "content": 0.043688591569662094, "timestamp": "2025-09-15 03:20:47.252397", "step": 1798, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:47.283446", "step": 1798, "epoch": 2 }, { "type": "loss", "content": 0.02030237577855587, "timestamp": "2025-09-15 03:20:47.285632", "step": 1799, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:47.315678", "step": 1799, "epoch": 2 }, { "type": "loss", "content": 0.0310269333422184, "timestamp": "2025-09-15 03:20:47.339295", "step": 1800, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:47.369023", "step": 1800, "epoch": 2 }, { "type": "loss", "content": 0.00030304151005111635, "timestamp": "2025-09-15 03:20:47.372188", "step": 1801, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:47.402167", "step": 1801, "epoch": 2 }, { "type": "loss", "content": 0.009272119961678982, "timestamp": "2025-09-15 03:20:47.404817", "step": 1802, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:47.435404", "step": 1802, "epoch": 2 }, { "type": "loss", "content": 0.0034189175348728895, "timestamp": "2025-09-15 03:20:47.437910", "step": 1803, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:47.468080", "step": 1803, "epoch": 2 }, { "type": "loss", "content": 0.0010578955989331007, "timestamp": "2025-09-15 03:20:47.491793", "step": 1804, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:47.523110", "step": 1804, "epoch": 2 }, { "type": "loss", "content": 0.012314596213400364, "timestamp": "2025-09-15 03:20:47.525210", "step": 1805, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:47.554945", "step": 1805, "epoch": 2 }, { "type": "loss", "content": 0.002753838896751404, "timestamp": "2025-09-15 03:20:47.556971", "step": 1806, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:47.587803", "step": 1806, "epoch": 2 }, { "type": "loss", "content": 0.013189495541155338, "timestamp": "2025-09-15 03:20:47.589892", "step": 1807, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:47.620166", "step": 1807, "epoch": 2 }, { "type": "loss", "content": 0.00679773697629571, "timestamp": "2025-09-15 03:20:47.643709", "step": 1808, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:47.673709", "step": 1808, "epoch": 2 }, { "type": "loss", "content": 0.0012394858058542013, "timestamp": "2025-09-15 03:20:47.676300", "step": 1809, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:47.706354", "step": 1809, "epoch": 2 }, { "type": "loss", "content": 0.008619984611868858, "timestamp": "2025-09-15 03:20:47.708756", "step": 1810, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:47.738318", "step": 1810, "epoch": 2 }, { "type": "loss", "content": 0.004352693445980549, "timestamp": "2025-09-15 03:20:47.740323", "step": 1811, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:47.770745", "step": 1811, "epoch": 2 }, { "type": "loss", "content": 0.0032818778418004513, "timestamp": "2025-09-15 03:20:47.794075", "step": 1812, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:47.824938", "step": 1812, "epoch": 2 }, { "type": "loss", "content": 0.004312444012612104, "timestamp": "2025-09-15 03:20:47.827045", "step": 1813, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:47.856827", "step": 1813, "epoch": 2 }, { "type": "loss", "content": 0.003813502611592412, "timestamp": "2025-09-15 03:20:47.858927", "step": 1814, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:47.889682", "step": 1814, "epoch": 2 }, { "type": "loss", "content": 0.009257403202354908, "timestamp": "2025-09-15 03:20:47.892233", "step": 1815, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:47.923309", "step": 1815, "epoch": 2 }, { "type": "loss", "content": 0.0016402419423684478, "timestamp": "2025-09-15 03:20:47.946759", "step": 1816, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:47.976831", "step": 1816, "epoch": 2 }, { "type": "loss", "content": 0.0017574775265529752, "timestamp": "2025-09-15 03:20:47.978838", "step": 1817, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:48.008313", "step": 1817, "epoch": 2 }, { "type": "loss", "content": 0.05542900040745735, "timestamp": "2025-09-15 03:20:48.010251", "step": 1818, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:48.039988", "step": 1818, "epoch": 2 }, { "type": "loss", "content": 0.04872005432844162, "timestamp": "2025-09-15 03:20:48.042069", "step": 1819, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:48.071859", "step": 1819, "epoch": 2 }, { "type": "loss", "content": 0.0038178430404514074, "timestamp": "2025-09-15 03:20:48.095431", "step": 1820, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:48.126232", "step": 1820, "epoch": 2 }, { "type": "loss", "content": 0.00633536372333765, "timestamp": "2025-09-15 03:20:48.128206", "step": 1821, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:48.157544", "step": 1821, "epoch": 2 }, { "type": "loss", "content": 0.003921452444046736, "timestamp": "2025-09-15 03:20:48.159710", "step": 1822, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:48.189801", "step": 1822, "epoch": 2 }, { "type": "loss", "content": 0.005446175578981638, "timestamp": "2025-09-15 03:20:48.191914", "step": 1823, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:48.221993", "step": 1823, "epoch": 2 }, { "type": "loss", "content": 0.027580486610531807, "timestamp": "2025-09-15 03:20:48.245276", "step": 1824, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:20:48.980456", "step": 1824, "epoch": 2 }, { "type": "pplx", "content": 84274127.79866508, "timestamp": "2025-09-15 03:20:48.982568", "step": 1824, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:49.010779", "step": 1824, "epoch": 2 }, { "type": "loss", "content": 0.006789560429751873, "timestamp": "2025-09-15 03:20:49.012856", "step": 1825, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:49.043087", "step": 1825, "epoch": 2 }, { "type": "loss", "content": 0.00810826662927866, "timestamp": "2025-09-15 03:20:49.045200", "step": 1826, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:49.076599", "step": 1826, "epoch": 2 }, { "type": "loss", "content": 0.003023170167580247, "timestamp": "2025-09-15 03:20:49.079771", "step": 1827, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:49.110409", "step": 1827, "epoch": 2 }, { "type": "loss", "content": 0.016775641590356827, "timestamp": "2025-09-15 03:20:49.134122", "step": 1828, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:49.163833", "step": 1828, "epoch": 2 }, { "type": "loss", "content": 0.0007950032013468444, "timestamp": "2025-09-15 03:20:49.165954", "step": 1829, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:49.196133", "step": 1829, "epoch": 2 }, { "type": "loss", "content": 0.04435531049966812, "timestamp": "2025-09-15 03:20:49.198152", "step": 1830, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:49.228427", "step": 1830, "epoch": 2 }, { "type": "loss", "content": 0.0005618184222839773, "timestamp": "2025-09-15 03:20:49.230466", "step": 1831, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:49.260564", "step": 1831, "epoch": 2 }, { "type": "loss", "content": 0.002401631325483322, "timestamp": "2025-09-15 03:20:49.284106", "step": 1832, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:49.314240", "step": 1832, "epoch": 2 }, { "type": "loss", "content": 0.0023109540343284607, "timestamp": "2025-09-15 03:20:49.316186", "step": 1833, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:49.346486", "step": 1833, "epoch": 2 }, { "type": "loss", "content": 0.007046803366392851, "timestamp": "2025-09-15 03:20:49.348641", "step": 1834, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:49.397598", "step": 1834, "epoch": 3 }, { "type": "loss", "content": 0.040310557931661606, "timestamp": "2025-09-15 03:20:49.399958", "step": 1835, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:49.430018", "step": 1835, "epoch": 3 }, { "type": "loss", "content": 0.013268978334963322, "timestamp": "2025-09-15 03:20:49.453512", "step": 1836, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:49.483351", "step": 1836, "epoch": 3 }, { "type": "loss", "content": 0.011992714367806911, "timestamp": "2025-09-15 03:20:49.485576", "step": 1837, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:49.515324", "step": 1837, "epoch": 3 }, { "type": "loss", "content": 0.004671222064644098, "timestamp": "2025-09-15 03:20:49.517338", "step": 1838, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:49.546913", "step": 1838, "epoch": 3 }, { "type": "loss", "content": 0.06716583669185638, "timestamp": "2025-09-15 03:20:49.549067", "step": 1839, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:49.578882", "step": 1839, "epoch": 3 }, { "type": "loss", "content": 0.020287616178393364, "timestamp": "2025-09-15 03:20:49.602300", "step": 1840, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:49.631774", "step": 1840, "epoch": 3 }, { "type": "loss", "content": 0.007350075524300337, "timestamp": "2025-09-15 03:20:49.633800", "step": 1841, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:49.663850", "step": 1841, "epoch": 3 }, { "type": "loss", "content": 0.001616068184375763, "timestamp": "2025-09-15 03:20:49.665806", "step": 1842, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:49.696464", "step": 1842, "epoch": 3 }, { "type": "loss", "content": 0.00043574132723733783, "timestamp": "2025-09-15 03:20:49.698764", "step": 1843, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:49.728942", "step": 1843, "epoch": 3 }, { "type": "loss", "content": 0.04232177138328552, "timestamp": "2025-09-15 03:20:49.752383", "step": 1844, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:49.782171", "step": 1844, "epoch": 3 }, { "type": "loss", "content": 0.01541460957378149, "timestamp": "2025-09-15 03:20:49.784140", "step": 1845, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:49.815011", "step": 1845, "epoch": 3 }, { "type": "loss", "content": 0.0018699252977967262, "timestamp": "2025-09-15 03:20:49.817076", "step": 1846, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:49.847003", "step": 1846, "epoch": 3 }, { "type": "loss", "content": 0.015663478523492813, "timestamp": "2025-09-15 03:20:49.849051", "step": 1847, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:49.879032", "step": 1847, "epoch": 3 }, { "type": "loss", "content": 0.029345327988266945, "timestamp": "2025-09-15 03:20:49.902469", "step": 1848, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:49.933206", "step": 1848, "epoch": 3 }, { "type": "loss", "content": 0.016897190362215042, "timestamp": "2025-09-15 03:20:49.935474", "step": 1849, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:49.965530", "step": 1849, "epoch": 3 }, { "type": "loss", "content": 0.015935799106955528, "timestamp": "2025-09-15 03:20:49.967846", "step": 1850, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:49.997627", "step": 1850, "epoch": 3 }, { "type": "loss", "content": 0.010694613680243492, "timestamp": "2025-09-15 03:20:50.000188", "step": 1851, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:50.029852", "step": 1851, "epoch": 3 }, { "type": "loss", "content": 0.0030253021977841854, "timestamp": "2025-09-15 03:20:50.053739", "step": 1852, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:50.084010", "step": 1852, "epoch": 3 }, { "type": "loss", "content": 0.022218191996216774, "timestamp": "2025-09-15 03:20:50.086116", "step": 1853, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:50.116063", "step": 1853, "epoch": 3 }, { "type": "loss", "content": 0.007684533949941397, "timestamp": "2025-09-15 03:20:50.118237", "step": 1854, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:50.149110", "step": 1854, "epoch": 3 }, { "type": "loss", "content": 0.034298017621040344, "timestamp": "2025-09-15 03:20:50.151343", "step": 1855, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:50.181171", "step": 1855, "epoch": 3 }, { "type": "loss", "content": 0.04699214920401573, "timestamp": "2025-09-15 03:20:50.204608", "step": 1856, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:50.234970", "step": 1856, "epoch": 3 }, { "type": "loss", "content": 0.008235123939812183, "timestamp": "2025-09-15 03:20:50.237249", "step": 1857, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:50.267000", "step": 1857, "epoch": 3 }, { "type": "loss", "content": 0.012289443984627724, "timestamp": "2025-09-15 03:20:50.269110", "step": 1858, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:50.298189", "step": 1858, "epoch": 3 }, { "type": "loss", "content": 0.01189574133604765, "timestamp": "2025-09-15 03:20:50.300413", "step": 1859, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:50.330532", "step": 1859, "epoch": 3 }, { "type": "loss", "content": 0.008656212128698826, "timestamp": "2025-09-15 03:20:50.353987", "step": 1860, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:50.384209", "step": 1860, "epoch": 3 }, { "type": "loss", "content": 0.00650829216465354, "timestamp": "2025-09-15 03:20:50.386467", "step": 1861, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:50.416154", "step": 1861, "epoch": 3 }, { "type": "loss", "content": 0.022483665496110916, "timestamp": "2025-09-15 03:20:50.418519", "step": 1862, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:50.449524", "step": 1862, "epoch": 3 }, { "type": "loss", "content": 0.005312159191817045, "timestamp": "2025-09-15 03:20:50.451704", "step": 1863, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:50.482022", "step": 1863, "epoch": 3 }, { "type": "loss", "content": 0.015590175986289978, "timestamp": "2025-09-15 03:20:50.505977", "step": 1864, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:50.536742", "step": 1864, "epoch": 3 }, { "type": "loss", "content": 0.015565576031804085, "timestamp": "2025-09-15 03:20:50.538965", "step": 1865, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:50.569106", "step": 1865, "epoch": 3 }, { "type": "loss", "content": 0.01493970025330782, "timestamp": "2025-09-15 03:20:50.571767", "step": 1866, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:50.602732", "step": 1866, "epoch": 3 }, { "type": "loss", "content": 0.006512957159429789, "timestamp": "2025-09-15 03:20:50.605205", "step": 1867, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:50.634820", "step": 1867, "epoch": 3 }, { "type": "loss", "content": 0.017866162583231926, "timestamp": "2025-09-15 03:20:50.658425", "step": 1868, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:50.688404", "step": 1868, "epoch": 3 }, { "type": "loss", "content": 0.00460231676697731, "timestamp": "2025-09-15 03:20:50.690727", "step": 1869, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:50.721143", "step": 1869, "epoch": 3 }, { "type": "loss", "content": 0.020427634939551353, "timestamp": "2025-09-15 03:20:50.723349", "step": 1870, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:50.754016", "step": 1870, "epoch": 3 }, { "type": "loss", "content": 0.01348375715315342, "timestamp": "2025-09-15 03:20:50.756298", "step": 1871, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:50.785901", "step": 1871, "epoch": 3 }, { "type": "loss", "content": 0.006125123240053654, "timestamp": "2025-09-15 03:20:50.809991", "step": 1872, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:50.840723", "step": 1872, "epoch": 3 }, { "type": "loss", "content": 0.010490350425243378, "timestamp": "2025-09-15 03:20:50.843088", "step": 1873, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:50.872991", "step": 1873, "epoch": 3 }, { "type": "loss", "content": 0.017887888476252556, "timestamp": "2025-09-15 03:20:50.875133", "step": 1874, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:50.904712", "step": 1874, "epoch": 3 }, { "type": "loss", "content": 0.00924895703792572, "timestamp": "2025-09-15 03:20:50.907104", "step": 1875, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:50.937259", "step": 1875, "epoch": 3 }, { "type": "loss", "content": 0.005214744247496128, "timestamp": "2025-09-15 03:20:50.961010", "step": 1876, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:50.991140", "step": 1876, "epoch": 3 }, { "type": "loss", "content": 0.009649758227169514, "timestamp": "2025-09-15 03:20:50.993549", "step": 1877, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:51.023860", "step": 1877, "epoch": 3 }, { "type": "loss", "content": 0.0009704649564810097, "timestamp": "2025-09-15 03:20:51.026196", "step": 1878, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:51.057399", "step": 1878, "epoch": 3 }, { "type": "loss", "content": 0.004649874288588762, "timestamp": "2025-09-15 03:20:51.059650", "step": 1879, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:51.090033", "step": 1879, "epoch": 3 }, { "type": "loss", "content": 0.0027771240565925837, "timestamp": "2025-09-15 03:20:51.113694", "step": 1880, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:51.144361", "step": 1880, "epoch": 3 }, { "type": "loss", "content": 0.007548298221081495, "timestamp": "2025-09-15 03:20:51.146866", "step": 1881, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:20:51.852012", "step": 1881, "epoch": 3 }, { "type": "pplx", "content": 79061968.0886773, "timestamp": "2025-09-15 03:20:51.854071", "step": 1881, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:51.882175", "step": 1881, "epoch": 3 }, { "type": "loss", "content": 0.0009704644908197224, "timestamp": "2025-09-15 03:20:51.884140", "step": 1882, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:51.913876", "step": 1882, "epoch": 3 }, { "type": "loss", "content": 0.001010966137982905, "timestamp": "2025-09-15 03:20:51.915862", "step": 1883, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:51.946098", "step": 1883, "epoch": 3 }, { "type": "loss", "content": 0.011982734315097332, "timestamp": "2025-09-15 03:20:51.969611", "step": 1884, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:52.000563", "step": 1884, "epoch": 3 }, { "type": "loss", "content": 0.0003622773219831288, "timestamp": "2025-09-15 03:20:52.002484", "step": 1885, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:52.031979", "step": 1885, "epoch": 3 }, { "type": "loss", "content": 0.0402422733604908, "timestamp": "2025-09-15 03:20:52.034011", "step": 1886, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:52.063552", "step": 1886, "epoch": 3 }, { "type": "loss", "content": 0.015572545118629932, "timestamp": "2025-09-15 03:20:52.065598", "step": 1887, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:52.095203", "step": 1887, "epoch": 3 }, { "type": "loss", "content": 0.02413473092019558, "timestamp": "2025-09-15 03:20:52.118638", "step": 1888, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:52.148362", "step": 1888, "epoch": 3 }, { "type": "loss", "content": 0.010828151367604733, "timestamp": "2025-09-15 03:20:52.150580", "step": 1889, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:52.181421", "step": 1889, "epoch": 3 }, { "type": "loss", "content": 0.010638976469635963, "timestamp": "2025-09-15 03:20:52.183846", "step": 1890, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:52.214797", "step": 1890, "epoch": 3 }, { "type": "loss", "content": 0.006136444862931967, "timestamp": "2025-09-15 03:20:52.218356", "step": 1891, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:52.248239", "step": 1891, "epoch": 3 }, { "type": "loss", "content": 0.005477941129356623, "timestamp": "2025-09-15 03:20:52.271651", "step": 1892, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:52.301664", "step": 1892, "epoch": 3 }, { "type": "loss", "content": 0.009350026026368141, "timestamp": "2025-09-15 03:20:52.303565", "step": 1893, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:52.333318", "step": 1893, "epoch": 3 }, { "type": "loss", "content": 0.03536457568407059, "timestamp": "2025-09-15 03:20:52.335236", "step": 1894, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:52.365100", "step": 1894, "epoch": 3 }, { "type": "loss", "content": 0.0048360563814640045, "timestamp": "2025-09-15 03:20:52.367179", "step": 1895, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:52.397509", "step": 1895, "epoch": 3 }, { "type": "loss", "content": 0.011292965151369572, "timestamp": "2025-09-15 03:20:52.421025", "step": 1896, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:52.453347", "step": 1896, "epoch": 3 }, { "type": "loss", "content": 0.0005236997385509312, "timestamp": "2025-09-15 03:20:52.455319", "step": 1897, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:52.485482", "step": 1897, "epoch": 3 }, { "type": "loss", "content": 0.001544496393762529, "timestamp": "2025-09-15 03:20:52.487561", "step": 1898, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:52.517393", "step": 1898, "epoch": 3 }, { "type": "loss", "content": 0.0006119143799878657, "timestamp": "2025-09-15 03:20:52.519281", "step": 1899, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:52.551270", "step": 1899, "epoch": 3 }, { "type": "loss", "content": 0.04506358131766319, "timestamp": "2025-09-15 03:20:52.574764", "step": 1900, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:52.604741", "step": 1900, "epoch": 3 }, { "type": "loss", "content": 0.0015903854509815574, "timestamp": "2025-09-15 03:20:52.606747", "step": 1901, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:52.636736", "step": 1901, "epoch": 3 }, { "type": "loss", "content": 0.0056643313728272915, "timestamp": "2025-09-15 03:20:52.638620", "step": 1902, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:52.668361", "step": 1902, "epoch": 3 }, { "type": "loss", "content": 0.012327468022704124, "timestamp": "2025-09-15 03:20:52.670561", "step": 1903, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:52.700271", "step": 1903, "epoch": 3 }, { "type": "loss", "content": 0.03373425453901291, "timestamp": "2025-09-15 03:20:52.723818", "step": 1904, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:52.754999", "step": 1904, "epoch": 3 }, { "type": "loss", "content": 0.0006273844628594816, "timestamp": "2025-09-15 03:20:52.757167", "step": 1905, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:52.786773", "step": 1905, "epoch": 3 }, { "type": "loss", "content": 0.03912340849637985, "timestamp": "2025-09-15 03:20:52.788766", "step": 1906, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:52.819254", "step": 1906, "epoch": 3 }, { "type": "loss", "content": 0.0008735408773645759, "timestamp": "2025-09-15 03:20:52.821325", "step": 1907, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:52.851495", "step": 1907, "epoch": 3 }, { "type": "loss", "content": 0.03705182299017906, "timestamp": "2025-09-15 03:20:52.874957", "step": 1908, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:52.904665", "step": 1908, "epoch": 3 }, { "type": "loss", "content": 0.011124390177428722, "timestamp": "2025-09-15 03:20:52.906745", "step": 1909, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:52.936488", "step": 1909, "epoch": 3 }, { "type": "loss", "content": 0.014945329166948795, "timestamp": "2025-09-15 03:20:52.938666", "step": 1910, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:52.969056", "step": 1910, "epoch": 3 }, { "type": "loss", "content": 0.013663768768310547, "timestamp": "2025-09-15 03:20:52.971198", "step": 1911, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:53.001105", "step": 1911, "epoch": 3 }, { "type": "loss", "content": 0.007298811338841915, "timestamp": "2025-09-15 03:20:53.024486", "step": 1912, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:53.056321", "step": 1912, "epoch": 3 }, { "type": "loss", "content": 0.00904295314103365, "timestamp": "2025-09-15 03:20:53.058356", "step": 1913, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:53.088290", "step": 1913, "epoch": 3 }, { "type": "loss", "content": 0.008594379760324955, "timestamp": "2025-09-15 03:20:53.090518", "step": 1914, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:53.120560", "step": 1914, "epoch": 3 }, { "type": "loss", "content": 0.020318692550063133, "timestamp": "2025-09-15 03:20:53.122652", "step": 1915, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:53.152315", "step": 1915, "epoch": 3 }, { "type": "loss", "content": 0.006535095628350973, "timestamp": "2025-09-15 03:20:53.175587", "step": 1916, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:53.205024", "step": 1916, "epoch": 3 }, { "type": "loss", "content": 0.003774958895519376, "timestamp": "2025-09-15 03:20:53.207151", "step": 1917, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:53.236931", "step": 1917, "epoch": 3 }, { "type": "loss", "content": 0.019767671823501587, "timestamp": "2025-09-15 03:20:53.239046", "step": 1918, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:53.269131", "step": 1918, "epoch": 3 }, { "type": "loss", "content": 0.0036115716211497784, "timestamp": "2025-09-15 03:20:53.271448", "step": 1919, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:53.301092", "step": 1919, "epoch": 3 }, { "type": "loss", "content": 0.02624139003455639, "timestamp": "2025-09-15 03:20:53.324534", "step": 1920, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:53.354373", "step": 1920, "epoch": 3 }, { "type": "loss", "content": 0.007259145379066467, "timestamp": "2025-09-15 03:20:53.356475", "step": 1921, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:53.387618", "step": 1921, "epoch": 3 }, { "type": "loss", "content": 0.00768707599490881, "timestamp": "2025-09-15 03:20:53.390468", "step": 1922, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:53.420646", "step": 1922, "epoch": 3 }, { "type": "loss", "content": 0.0268842875957489, "timestamp": "2025-09-15 03:20:53.423167", "step": 1923, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:53.455453", "step": 1923, "epoch": 3 }, { "type": "loss", "content": 0.01972857303917408, "timestamp": "2025-09-15 03:20:53.478851", "step": 1924, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:53.508675", "step": 1924, "epoch": 3 }, { "type": "loss", "content": 0.022408347576856613, "timestamp": "2025-09-15 03:20:53.511847", "step": 1925, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:53.541430", "step": 1925, "epoch": 3 }, { "type": "loss", "content": 0.02350449375808239, "timestamp": "2025-09-15 03:20:53.543472", "step": 1926, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:53.573169", "step": 1926, "epoch": 3 }, { "type": "loss", "content": 0.002779848873615265, "timestamp": "2025-09-15 03:20:53.575300", "step": 1927, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:53.605608", "step": 1927, "epoch": 3 }, { "type": "loss", "content": 0.006933249067515135, "timestamp": "2025-09-15 03:20:53.629059", "step": 1928, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:53.658800", "step": 1928, "epoch": 3 }, { "type": "loss", "content": 0.01789715513586998, "timestamp": "2025-09-15 03:20:53.660814", "step": 1929, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:53.691321", "step": 1929, "epoch": 3 }, { "type": "loss", "content": 0.049555934965610504, "timestamp": "2025-09-15 03:20:53.693338", "step": 1930, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:53.723340", "step": 1930, "epoch": 3 }, { "type": "loss", "content": 0.009596183896064758, "timestamp": "2025-09-15 03:20:53.725364", "step": 1931, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:53.755034", "step": 1931, "epoch": 3 }, { "type": "loss", "content": 0.004284258931875229, "timestamp": "2025-09-15 03:20:53.778474", "step": 1932, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:53.807925", "step": 1932, "epoch": 3 }, { "type": "loss", "content": 0.001384135102853179, "timestamp": "2025-09-15 03:20:53.810121", "step": 1933, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:53.839919", "step": 1933, "epoch": 3 }, { "type": "loss", "content": 0.00989693682640791, "timestamp": "2025-09-15 03:20:53.842138", "step": 1934, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:53.871865", "step": 1934, "epoch": 3 }, { "type": "loss", "content": 0.0024838608223944902, "timestamp": "2025-09-15 03:20:53.873925", "step": 1935, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:53.903458", "step": 1935, "epoch": 3 }, { "type": "loss", "content": 0.007661298383027315, "timestamp": "2025-09-15 03:20:53.927356", "step": 1936, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:53.957124", "step": 1936, "epoch": 3 }, { "type": "loss", "content": 0.0049940976314246655, "timestamp": "2025-09-15 03:20:53.959512", "step": 1937, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:53.988953", "step": 1937, "epoch": 3 }, { "type": "loss", "content": 0.005934380926191807, "timestamp": "2025-09-15 03:20:53.991356", "step": 1938, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:20:54.705074", "step": 1938, "epoch": 3 }, { "type": "pplx", "content": 52201116.26590185, "timestamp": "2025-09-15 03:20:54.707300", "step": 1938, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:54.735523", "step": 1938, "epoch": 3 }, { "type": "loss", "content": 0.03174247592687607, "timestamp": "2025-09-15 03:20:54.737647", "step": 1939, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:54.767669", "step": 1939, "epoch": 3 }, { "type": "loss", "content": 0.002668730914592743, "timestamp": "2025-09-15 03:20:54.791277", "step": 1940, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:54.821190", "step": 1940, "epoch": 3 }, { "type": "loss", "content": 0.006054393015801907, "timestamp": "2025-09-15 03:20:54.823585", "step": 1941, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:54.854030", "step": 1941, "epoch": 3 }, { "type": "loss", "content": 0.009095175191760063, "timestamp": "2025-09-15 03:20:54.856439", "step": 1942, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:54.886495", "step": 1942, "epoch": 3 }, { "type": "loss", "content": 0.029130371287465096, "timestamp": "2025-09-15 03:20:54.888255", "step": 1943, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:20:54.917756", "step": 1943, "epoch": 3 }, { "type": "loss", "content": 0.040686529129743576, "timestamp": "2025-09-15 03:20:54.941375", "step": 1944, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:54.971974", "step": 1944, "epoch": 3 }, { "type": "loss", "content": 0.0014195304829627275, "timestamp": "2025-09-15 03:20:54.974304", "step": 1945, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:55.005882", "step": 1945, "epoch": 3 }, { "type": "loss", "content": 0.009091906249523163, "timestamp": "2025-09-15 03:20:55.008082", "step": 1946, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:55.038113", "step": 1946, "epoch": 3 }, { "type": "loss", "content": 0.0034668713342398405, "timestamp": "2025-09-15 03:20:55.040294", "step": 1947, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:55.070322", "step": 1947, "epoch": 3 }, { "type": "loss", "content": 0.0020099026151001453, "timestamp": "2025-09-15 03:20:55.093940", "step": 1948, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:55.124139", "step": 1948, "epoch": 3 }, { "type": "loss", "content": 0.006301121320575476, "timestamp": "2025-09-15 03:20:55.126148", "step": 1949, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:55.156304", "step": 1949, "epoch": 3 }, { "type": "loss", "content": 0.008950197137892246, "timestamp": "2025-09-15 03:20:55.159855", "step": 1950, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:55.190260", "step": 1950, "epoch": 3 }, { "type": "loss", "content": 0.010447906330227852, "timestamp": "2025-09-15 03:20:55.192278", "step": 1951, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:55.222749", "step": 1951, "epoch": 3 }, { "type": "loss", "content": 0.016328440979123116, "timestamp": "2025-09-15 03:20:55.246128", "step": 1952, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:55.275743", "step": 1952, "epoch": 3 }, { "type": "loss", "content": 0.0013112931046634912, "timestamp": "2025-09-15 03:20:55.277433", "step": 1953, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:55.306937", "step": 1953, "epoch": 3 }, { "type": "loss", "content": 0.002939864993095398, "timestamp": "2025-09-15 03:20:55.309128", "step": 1954, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:55.338856", "step": 1954, "epoch": 3 }, { "type": "loss", "content": 0.04834269359707832, "timestamp": "2025-09-15 03:20:55.340701", "step": 1955, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:55.370272", "step": 1955, "epoch": 3 }, { "type": "loss", "content": 0.017391914501786232, "timestamp": "2025-09-15 03:20:55.393722", "step": 1956, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:55.424170", "step": 1956, "epoch": 3 }, { "type": "loss", "content": 0.009930714964866638, "timestamp": "2025-09-15 03:20:55.426034", "step": 1957, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:55.455602", "step": 1957, "epoch": 3 }, { "type": "loss", "content": 0.004469075705856085, "timestamp": "2025-09-15 03:20:55.458941", "step": 1958, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:55.488991", "step": 1958, "epoch": 3 }, { "type": "loss", "content": 0.0024031256325542927, "timestamp": "2025-09-15 03:20:55.491420", "step": 1959, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:55.521239", "step": 1959, "epoch": 3 }, { "type": "loss", "content": 0.03323809802532196, "timestamp": "2025-09-15 03:20:55.544528", "step": 1960, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:55.574723", "step": 1960, "epoch": 3 }, { "type": "loss", "content": 0.002740835305303335, "timestamp": "2025-09-15 03:20:55.576837", "step": 1961, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:55.607287", "step": 1961, "epoch": 3 }, { "type": "loss", "content": 0.0028987224213778973, "timestamp": "2025-09-15 03:20:55.609454", "step": 1962, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:55.639352", "step": 1962, "epoch": 3 }, { "type": "loss", "content": 0.020512523129582405, "timestamp": "2025-09-15 03:20:55.641434", "step": 1963, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:55.671263", "step": 1963, "epoch": 3 }, { "type": "loss", "content": 0.04198675602674484, "timestamp": "2025-09-15 03:20:55.694909", "step": 1964, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:55.724696", "step": 1964, "epoch": 3 }, { "type": "loss", "content": 0.006624828092753887, "timestamp": "2025-09-15 03:20:55.727072", "step": 1965, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:55.757139", "step": 1965, "epoch": 3 }, { "type": "loss", "content": 0.002662785816937685, "timestamp": "2025-09-15 03:20:55.759359", "step": 1966, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:55.790281", "step": 1966, "epoch": 3 }, { "type": "loss", "content": 0.003230338217690587, "timestamp": "2025-09-15 03:20:55.792494", "step": 1967, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:55.822213", "step": 1967, "epoch": 3 }, { "type": "loss", "content": 0.017028162255883217, "timestamp": "2025-09-15 03:20:55.845573", "step": 1968, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:55.875760", "step": 1968, "epoch": 3 }, { "type": "loss", "content": 0.0020218912977725267, "timestamp": "2025-09-15 03:20:55.877705", "step": 1969, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:55.908857", "step": 1969, "epoch": 3 }, { "type": "loss", "content": 0.010596668347716331, "timestamp": "2025-09-15 03:20:55.910785", "step": 1970, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:55.941058", "step": 1970, "epoch": 3 }, { "type": "loss", "content": 0.0063810343854129314, "timestamp": "2025-09-15 03:20:55.943094", "step": 1971, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:55.974331", "step": 1971, "epoch": 3 }, { "type": "loss", "content": 0.024318158626556396, "timestamp": "2025-09-15 03:20:55.997815", "step": 1972, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:56.027461", "step": 1972, "epoch": 3 }, { "type": "loss", "content": 0.014820395037531853, "timestamp": "2025-09-15 03:20:56.029277", "step": 1973, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:56.059371", "step": 1973, "epoch": 3 }, { "type": "loss", "content": 0.007273904979228973, "timestamp": "2025-09-15 03:20:56.061585", "step": 1974, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:56.092700", "step": 1974, "epoch": 3 }, { "type": "loss", "content": 0.0198097825050354, "timestamp": "2025-09-15 03:20:56.094906", "step": 1975, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:56.125797", "step": 1975, "epoch": 3 }, { "type": "loss", "content": 0.010190504603087902, "timestamp": "2025-09-15 03:20:56.149316", "step": 1976, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:56.179098", "step": 1976, "epoch": 3 }, { "type": "loss", "content": 0.005385191645473242, "timestamp": "2025-09-15 03:20:56.180907", "step": 1977, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:56.210819", "step": 1977, "epoch": 3 }, { "type": "loss", "content": 0.011977910064160824, "timestamp": "2025-09-15 03:20:56.213146", "step": 1978, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:56.243537", "step": 1978, "epoch": 3 }, { "type": "loss", "content": 0.013383844867348671, "timestamp": "2025-09-15 03:20:56.245460", "step": 1979, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:56.275859", "step": 1979, "epoch": 3 }, { "type": "loss", "content": 0.0016832282999530435, "timestamp": "2025-09-15 03:20:56.299044", "step": 1980, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:56.329750", "step": 1980, "epoch": 3 }, { "type": "loss", "content": 0.004006501287221909, "timestamp": "2025-09-15 03:20:56.331897", "step": 1981, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:56.362176", "step": 1981, "epoch": 3 }, { "type": "loss", "content": 0.009227439761161804, "timestamp": "2025-09-15 03:20:56.364032", "step": 1982, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:56.394255", "step": 1982, "epoch": 3 }, { "type": "loss", "content": 0.0073464675806462765, "timestamp": "2025-09-15 03:20:56.396433", "step": 1983, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:56.428168", "step": 1983, "epoch": 3 }, { "type": "loss", "content": 0.0061258794739842415, "timestamp": "2025-09-15 03:20:56.451663", "step": 1984, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:56.482146", "step": 1984, "epoch": 3 }, { "type": "loss", "content": 0.0024023440200835466, "timestamp": "2025-09-15 03:20:56.484190", "step": 1985, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:56.514611", "step": 1985, "epoch": 3 }, { "type": "loss", "content": 0.001554057002067566, "timestamp": "2025-09-15 03:20:56.516678", "step": 1986, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:56.549468", "step": 1986, "epoch": 3 }, { "type": "loss", "content": 0.02349625527858734, "timestamp": "2025-09-15 03:20:56.551484", "step": 1987, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:56.581914", "step": 1987, "epoch": 3 }, { "type": "loss", "content": 0.015915708616375923, "timestamp": "2025-09-15 03:20:56.605227", "step": 1988, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:56.635639", "step": 1988, "epoch": 3 }, { "type": "loss", "content": 0.0070765577256679535, "timestamp": "2025-09-15 03:20:56.637897", "step": 1989, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:56.667794", "step": 1989, "epoch": 3 }, { "type": "loss", "content": 0.0020205960609018803, "timestamp": "2025-09-15 03:20:56.669569", "step": 1990, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:56.699597", "step": 1990, "epoch": 3 }, { "type": "loss", "content": 0.03761409968137741, "timestamp": "2025-09-15 03:20:56.701466", "step": 1991, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:56.732875", "step": 1991, "epoch": 3 }, { "type": "loss", "content": 0.027002081274986267, "timestamp": "2025-09-15 03:20:56.756203", "step": 1992, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:56.786469", "step": 1992, "epoch": 3 }, { "type": "loss", "content": 0.05631239339709282, "timestamp": "2025-09-15 03:20:56.788464", "step": 1993, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:56.818334", "step": 1993, "epoch": 3 }, { "type": "loss", "content": 0.009488861076533794, "timestamp": "2025-09-15 03:20:56.820565", "step": 1994, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:56.851031", "step": 1994, "epoch": 3 }, { "type": "loss", "content": 0.0019661146216094494, "timestamp": "2025-09-15 03:20:56.853077", "step": 1995, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:20:57.584519", "step": 1995, "epoch": 3 }, { "type": "pplx", "content": 52256625.34364043, "timestamp": "2025-09-15 03:20:57.587161", "step": 1995, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:57.616644", "step": 1995, "epoch": 3 }, { "type": "loss", "content": 0.0030057637486606836, "timestamp": "2025-09-15 03:20:57.639958", "step": 1996, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:57.670232", "step": 1996, "epoch": 3 }, { "type": "loss", "content": 0.0027218328323215246, "timestamp": "2025-09-15 03:20:57.672073", "step": 1997, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:20:57.702246", "step": 1997, "epoch": 3 }, { "type": "loss", "content": 0.012235582806169987, "timestamp": "2025-09-15 03:20:57.704160", "step": 1998, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:20:57.734468", "step": 1998, "epoch": 3 }, { "type": "loss", "content": 0.013591994531452656, "timestamp": "2025-09-15 03:20:57.736468", "step": 1999, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:20:57.766900", "step": 1999, "epoch": 3 }, { "type": "loss", "content": 0.013293951749801636, "timestamp": "2025-09-15 03:20:57.790317", "step": 2000, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2000", "timestamp": "2025-09-15 03:21:04.244838", "step": 2000, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:04.279604", "step": 2000, "epoch": 3 }, { "type": "loss", "content": 0.01585422083735466, "timestamp": "2025-09-15 03:21:04.281723", "step": 2001, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:04.313622", "step": 2001, "epoch": 3 }, { "type": "loss", "content": 0.001507807755842805, "timestamp": "2025-09-15 03:21:04.315694", "step": 2002, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:04.346442", "step": 2002, "epoch": 3 }, { "type": "loss", "content": 0.0021584215573966503, "timestamp": "2025-09-15 03:21:04.348477", "step": 2003, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:04.378328", "step": 2003, "epoch": 3 }, { "type": "loss", "content": 0.04452117905020714, "timestamp": "2025-09-15 03:21:04.401911", "step": 2004, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:04.432932", "step": 2004, "epoch": 3 }, { "type": "loss", "content": 0.0012962737819179893, "timestamp": "2025-09-15 03:21:04.434908", "step": 2005, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:04.465187", "step": 2005, "epoch": 3 }, { "type": "loss", "content": 0.02925196662545204, "timestamp": "2025-09-15 03:21:04.467299", "step": 2006, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:04.497516", "step": 2006, "epoch": 3 }, { "type": "loss", "content": 0.006457092706114054, "timestamp": "2025-09-15 03:21:04.499404", "step": 2007, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:04.530176", "step": 2007, "epoch": 3 }, { "type": "loss", "content": 0.014313125051558018, "timestamp": "2025-09-15 03:21:04.553819", "step": 2008, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:04.585146", "step": 2008, "epoch": 3 }, { "type": "loss", "content": 0.007027280982583761, "timestamp": "2025-09-15 03:21:04.587228", "step": 2009, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:04.618126", "step": 2009, "epoch": 3 }, { "type": "loss", "content": 0.015289964154362679, "timestamp": "2025-09-15 03:21:04.620200", "step": 2010, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:04.650770", "step": 2010, "epoch": 3 }, { "type": "loss", "content": 0.006559719797223806, "timestamp": "2025-09-15 03:21:04.652852", "step": 2011, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:04.684000", "step": 2011, "epoch": 3 }, { "type": "loss", "content": 0.004909235052764416, "timestamp": "2025-09-15 03:21:04.707783", "step": 2012, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:04.738585", "step": 2012, "epoch": 3 }, { "type": "loss", "content": 0.002195314271375537, "timestamp": "2025-09-15 03:21:04.740815", "step": 2013, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:04.771585", "step": 2013, "epoch": 3 }, { "type": "loss", "content": 0.003565679071471095, "timestamp": "2025-09-15 03:21:04.773869", "step": 2014, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:04.805568", "step": 2014, "epoch": 3 }, { "type": "loss", "content": 0.028394032269716263, "timestamp": "2025-09-15 03:21:04.807914", "step": 2015, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:04.837639", "step": 2015, "epoch": 3 }, { "type": "loss", "content": 0.012641467154026031, "timestamp": "2025-09-15 03:21:04.861599", "step": 2016, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:04.892180", "step": 2016, "epoch": 3 }, { "type": "loss", "content": 0.005878259893506765, "timestamp": "2025-09-15 03:21:04.894417", "step": 2017, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:04.925315", "step": 2017, "epoch": 3 }, { "type": "loss", "content": 0.011266002431511879, "timestamp": "2025-09-15 03:21:04.927410", "step": 2018, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:04.958086", "step": 2018, "epoch": 3 }, { "type": "loss", "content": 0.0023640652652829885, "timestamp": "2025-09-15 03:21:04.960093", "step": 2019, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:04.991079", "step": 2019, "epoch": 3 }, { "type": "loss", "content": 0.007986019365489483, "timestamp": "2025-09-15 03:21:05.014635", "step": 2020, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:05.045562", "step": 2020, "epoch": 3 }, { "type": "loss", "content": 0.002925436245277524, "timestamp": "2025-09-15 03:21:05.048057", "step": 2021, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:05.079498", "step": 2021, "epoch": 3 }, { "type": "loss", "content": 0.005862175952643156, "timestamp": "2025-09-15 03:21:05.082027", "step": 2022, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:05.113280", "step": 2022, "epoch": 3 }, { "type": "loss", "content": 0.0018555274000391364, "timestamp": "2025-09-15 03:21:05.115406", "step": 2023, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:05.145494", "step": 2023, "epoch": 3 }, { "type": "loss", "content": 0.01033469382673502, "timestamp": "2025-09-15 03:21:05.168989", "step": 2024, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:05.199517", "step": 2024, "epoch": 3 }, { "type": "loss", "content": 0.016592005267739296, "timestamp": "2025-09-15 03:21:05.201697", "step": 2025, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:05.232272", "step": 2025, "epoch": 3 }, { "type": "loss", "content": 0.0015080816810950637, "timestamp": "2025-09-15 03:21:05.234420", "step": 2026, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:05.265099", "step": 2026, "epoch": 3 }, { "type": "loss", "content": 0.002308989642187953, "timestamp": "2025-09-15 03:21:05.267237", "step": 2027, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:05.298232", "step": 2027, "epoch": 3 }, { "type": "loss", "content": 0.005254893563687801, "timestamp": "2025-09-15 03:21:05.322029", "step": 2028, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:05.352870", "step": 2028, "epoch": 3 }, { "type": "loss", "content": 0.02585950866341591, "timestamp": "2025-09-15 03:21:05.355003", "step": 2029, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:05.385641", "step": 2029, "epoch": 3 }, { "type": "loss", "content": 0.017035724595189095, "timestamp": "2025-09-15 03:21:05.387615", "step": 2030, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:05.418267", "step": 2030, "epoch": 3 }, { "type": "loss", "content": 0.0030646901577711105, "timestamp": "2025-09-15 03:21:05.420386", "step": 2031, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:05.451148", "step": 2031, "epoch": 3 }, { "type": "loss", "content": 0.005517587065696716, "timestamp": "2025-09-15 03:21:05.474646", "step": 2032, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:05.505339", "step": 2032, "epoch": 3 }, { "type": "loss", "content": 0.015570155344903469, "timestamp": "2025-09-15 03:21:05.507452", "step": 2033, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:05.537443", "step": 2033, "epoch": 3 }, { "type": "loss", "content": 0.0066670882515609264, "timestamp": "2025-09-15 03:21:05.539505", "step": 2034, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:05.569512", "step": 2034, "epoch": 3 }, { "type": "loss", "content": 0.0035194784868508577, "timestamp": "2025-09-15 03:21:05.571683", "step": 2035, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:05.602120", "step": 2035, "epoch": 3 }, { "type": "loss", "content": 0.015802303329110146, "timestamp": "2025-09-15 03:21:05.625593", "step": 2036, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:05.658084", "step": 2036, "epoch": 3 }, { "type": "loss", "content": 0.008249717764556408, "timestamp": "2025-09-15 03:21:05.660321", "step": 2037, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:05.690915", "step": 2037, "epoch": 3 }, { "type": "loss", "content": 0.02149888686835766, "timestamp": "2025-09-15 03:21:05.692955", "step": 2038, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:05.723767", "step": 2038, "epoch": 3 }, { "type": "loss", "content": 0.004375663120299578, "timestamp": "2025-09-15 03:21:05.726164", "step": 2039, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:05.757205", "step": 2039, "epoch": 3 }, { "type": "loss", "content": 0.0018813589122146368, "timestamp": "2025-09-15 03:21:05.780917", "step": 2040, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:05.811556", "step": 2040, "epoch": 3 }, { "type": "loss", "content": 0.012875386513769627, "timestamp": "2025-09-15 03:21:05.813683", "step": 2041, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:05.843697", "step": 2041, "epoch": 3 }, { "type": "loss", "content": 0.0009363238350488245, "timestamp": "2025-09-15 03:21:05.846413", "step": 2042, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:05.882725", "step": 2042, "epoch": 3 }, { "type": "loss", "content": 0.013507531024515629, "timestamp": "2025-09-15 03:21:05.884925", "step": 2043, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:05.915336", "step": 2043, "epoch": 3 }, { "type": "loss", "content": 0.02033652924001217, "timestamp": "2025-09-15 03:21:05.939133", "step": 2044, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:05.969433", "step": 2044, "epoch": 3 }, { "type": "loss", "content": 0.004257784225046635, "timestamp": "2025-09-15 03:21:05.971495", "step": 2045, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:06.001563", "step": 2045, "epoch": 3 }, { "type": "loss", "content": 0.04417576640844345, "timestamp": "2025-09-15 03:21:06.003604", "step": 2046, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:06.033828", "step": 2046, "epoch": 3 }, { "type": "loss", "content": 0.01268736831843853, "timestamp": "2025-09-15 03:21:06.035902", "step": 2047, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:06.066219", "step": 2047, "epoch": 3 }, { "type": "loss", "content": 0.0059195528738200665, "timestamp": "2025-09-15 03:21:06.090609", "step": 2048, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:06.122957", "step": 2048, "epoch": 3 }, { "type": "loss", "content": 0.002530893078073859, "timestamp": "2025-09-15 03:21:06.124928", "step": 2049, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:06.155336", "step": 2049, "epoch": 3 }, { "type": "loss", "content": 0.007208889815956354, "timestamp": "2025-09-15 03:21:06.157413", "step": 2050, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:06.188975", "step": 2050, "epoch": 3 }, { "type": "loss", "content": 0.007339324336498976, "timestamp": "2025-09-15 03:21:06.191223", "step": 2051, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:06.222004", "step": 2051, "epoch": 3 }, { "type": "loss", "content": 0.031263567507267, "timestamp": "2025-09-15 03:21:06.245643", "step": 2052, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:21:06.985869", "step": 2052, "epoch": 3 }, { "type": "pplx", "content": 59827417.800061285, "timestamp": "2025-09-15 03:21:06.988010", "step": 2052, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:07.017171", "step": 2052, "epoch": 3 }, { "type": "loss", "content": 0.004454520996659994, "timestamp": "2025-09-15 03:21:07.019431", "step": 2053, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:07.051489", "step": 2053, "epoch": 3 }, { "type": "loss", "content": 0.003587186336517334, "timestamp": "2025-09-15 03:21:07.053563", "step": 2054, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:07.084326", "step": 2054, "epoch": 3 }, { "type": "loss", "content": 0.005687421653419733, "timestamp": "2025-09-15 03:21:07.086619", "step": 2055, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:07.117312", "step": 2055, "epoch": 3 }, { "type": "loss", "content": 0.0027272473089396954, "timestamp": "2025-09-15 03:21:07.141245", "step": 2056, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:07.172840", "step": 2056, "epoch": 3 }, { "type": "loss", "content": 0.015834230929613113, "timestamp": "2025-09-15 03:21:07.174912", "step": 2057, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:07.205655", "step": 2057, "epoch": 3 }, { "type": "loss", "content": 0.002714097034186125, "timestamp": "2025-09-15 03:21:07.208618", "step": 2058, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:07.238702", "step": 2058, "epoch": 3 }, { "type": "loss", "content": 0.005063068121671677, "timestamp": "2025-09-15 03:21:07.240817", "step": 2059, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:07.271865", "step": 2059, "epoch": 3 }, { "type": "loss", "content": 0.003443613648414612, "timestamp": "2025-09-15 03:21:07.295851", "step": 2060, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:07.326844", "step": 2060, "epoch": 3 }, { "type": "loss", "content": 0.0057801674120128155, "timestamp": "2025-09-15 03:21:07.328860", "step": 2061, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:07.360331", "step": 2061, "epoch": 3 }, { "type": "loss", "content": 0.0011709003010764718, "timestamp": "2025-09-15 03:21:07.362481", "step": 2062, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:07.392641", "step": 2062, "epoch": 3 }, { "type": "loss", "content": 0.016910651698708534, "timestamp": "2025-09-15 03:21:07.395025", "step": 2063, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:07.425973", "step": 2063, "epoch": 3 }, { "type": "loss", "content": 0.007694550324231386, "timestamp": "2025-09-15 03:21:07.449471", "step": 2064, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:07.481036", "step": 2064, "epoch": 3 }, { "type": "loss", "content": 0.0011027039727196097, "timestamp": "2025-09-15 03:21:07.483245", "step": 2065, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:07.514017", "step": 2065, "epoch": 3 }, { "type": "loss", "content": 0.006830222904682159, "timestamp": "2025-09-15 03:21:07.516068", "step": 2066, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:07.546654", "step": 2066, "epoch": 3 }, { "type": "loss", "content": 0.014835420064628124, "timestamp": "2025-09-15 03:21:07.548709", "step": 2067, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:07.579588", "step": 2067, "epoch": 3 }, { "type": "loss", "content": 0.02849721722304821, "timestamp": "2025-09-15 03:21:07.603145", "step": 2068, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:07.634053", "step": 2068, "epoch": 3 }, { "type": "loss", "content": 0.0010745684849098325, "timestamp": "2025-09-15 03:21:07.636202", "step": 2069, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:07.666979", "step": 2069, "epoch": 3 }, { "type": "loss", "content": 0.009703867137432098, "timestamp": "2025-09-15 03:21:07.669452", "step": 2070, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:07.700172", "step": 2070, "epoch": 3 }, { "type": "loss", "content": 0.0004812530241906643, "timestamp": "2025-09-15 03:21:07.702361", "step": 2071, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:07.732785", "step": 2071, "epoch": 3 }, { "type": "loss", "content": 0.026970041915774345, "timestamp": "2025-09-15 03:21:07.756546", "step": 2072, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:07.788069", "step": 2072, "epoch": 3 }, { "type": "loss", "content": 0.01335603091865778, "timestamp": "2025-09-15 03:21:07.790299", "step": 2073, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:07.820655", "step": 2073, "epoch": 3 }, { "type": "loss", "content": 0.0024553367402404547, "timestamp": "2025-09-15 03:21:07.822901", "step": 2074, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:07.853406", "step": 2074, "epoch": 3 }, { "type": "loss", "content": 0.0027745079714804888, "timestamp": "2025-09-15 03:21:07.855520", "step": 2075, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:07.885728", "step": 2075, "epoch": 3 }, { "type": "loss", "content": 0.00588145712390542, "timestamp": "2025-09-15 03:21:07.909403", "step": 2076, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:07.940245", "step": 2076, "epoch": 3 }, { "type": "loss", "content": 0.004154204856604338, "timestamp": "2025-09-15 03:21:07.942235", "step": 2077, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:07.973921", "step": 2077, "epoch": 3 }, { "type": "loss", "content": 0.024734104052186012, "timestamp": "2025-09-15 03:21:07.975979", "step": 2078, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:08.006119", "step": 2078, "epoch": 3 }, { "type": "loss", "content": 0.020028170198202133, "timestamp": "2025-09-15 03:21:08.007851", "step": 2079, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:08.038062", "step": 2079, "epoch": 3 }, { "type": "loss", "content": 0.005237225443124771, "timestamp": "2025-09-15 03:21:08.061562", "step": 2080, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:08.091861", "step": 2080, "epoch": 3 }, { "type": "loss", "content": 0.006769266445189714, "timestamp": "2025-09-15 03:21:08.094052", "step": 2081, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:08.124369", "step": 2081, "epoch": 3 }, { "type": "loss", "content": 0.0024314168840646744, "timestamp": "2025-09-15 03:21:08.126294", "step": 2082, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:08.156660", "step": 2082, "epoch": 3 }, { "type": "loss", "content": 0.008758151903748512, "timestamp": "2025-09-15 03:21:08.158628", "step": 2083, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:08.189137", "step": 2083, "epoch": 3 }, { "type": "loss", "content": 0.001061967690475285, "timestamp": "2025-09-15 03:21:08.212826", "step": 2084, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:08.244237", "step": 2084, "epoch": 3 }, { "type": "loss", "content": 0.052610017359256744, "timestamp": "2025-09-15 03:21:08.246563", "step": 2085, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:08.278921", "step": 2085, "epoch": 3 }, { "type": "loss", "content": 0.011462748982012272, "timestamp": "2025-09-15 03:21:08.281078", "step": 2086, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:08.311628", "step": 2086, "epoch": 3 }, { "type": "loss", "content": 0.005782654043287039, "timestamp": "2025-09-15 03:21:08.313898", "step": 2087, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:08.345191", "step": 2087, "epoch": 3 }, { "type": "loss", "content": 0.008149498142302036, "timestamp": "2025-09-15 03:21:08.368821", "step": 2088, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:08.399636", "step": 2088, "epoch": 3 }, { "type": "loss", "content": 0.001337492954917252, "timestamp": "2025-09-15 03:21:08.401922", "step": 2089, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:21:08.432968", "step": 2089, "epoch": 3 }, { "type": "loss", "content": 0.0016679230611771345, "timestamp": "2025-09-15 03:21:08.435273", "step": 2090, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:08.465504", "step": 2090, "epoch": 3 }, { "type": "loss", "content": 0.001318950904533267, "timestamp": "2025-09-15 03:21:08.467623", "step": 2091, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:08.498023", "step": 2091, "epoch": 3 }, { "type": "loss", "content": 0.007607594132423401, "timestamp": "2025-09-15 03:21:08.521602", "step": 2092, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:08.553684", "step": 2092, "epoch": 3 }, { "type": "loss", "content": 0.0025145213585346937, "timestamp": "2025-09-15 03:21:08.555960", "step": 2093, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:08.587335", "step": 2093, "epoch": 3 }, { "type": "loss", "content": 0.002167313126847148, "timestamp": "2025-09-15 03:21:08.592067", "step": 2094, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:08.625875", "step": 2094, "epoch": 3 }, { "type": "loss", "content": 0.0023553003557026386, "timestamp": "2025-09-15 03:21:08.630849", "step": 2095, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:08.664530", "step": 2095, "epoch": 3 }, { "type": "loss", "content": 0.04698661342263222, "timestamp": "2025-09-15 03:21:08.688042", "step": 2096, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:08.718577", "step": 2096, "epoch": 3 }, { "type": "loss", "content": 0.0016108545241877437, "timestamp": "2025-09-15 03:21:08.720747", "step": 2097, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:08.750979", "step": 2097, "epoch": 3 }, { "type": "loss", "content": 0.05345345288515091, "timestamp": "2025-09-15 03:21:08.753657", "step": 2098, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:08.785157", "step": 2098, "epoch": 3 }, { "type": "loss", "content": 0.017849748954176903, "timestamp": "2025-09-15 03:21:08.787350", "step": 2099, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:08.817721", "step": 2099, "epoch": 3 }, { "type": "loss", "content": 0.005959447007626295, "timestamp": "2025-09-15 03:21:08.841043", "step": 2100, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:08.872160", "step": 2100, "epoch": 3 }, { "type": "loss", "content": 0.0007292155059985816, "timestamp": "2025-09-15 03:21:08.874250", "step": 2101, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:08.904928", "step": 2101, "epoch": 3 }, { "type": "loss", "content": 0.0012959071900695562, "timestamp": "2025-09-15 03:21:08.906997", "step": 2102, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:08.937128", "step": 2102, "epoch": 3 }, { "type": "loss", "content": 0.01766919530928135, "timestamp": "2025-09-15 03:21:08.939101", "step": 2103, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:08.969575", "step": 2103, "epoch": 3 }, { "type": "loss", "content": 0.0009708349825814366, "timestamp": "2025-09-15 03:21:08.993311", "step": 2104, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:09.023627", "step": 2104, "epoch": 3 }, { "type": "loss", "content": 0.0029763688798993826, "timestamp": "2025-09-15 03:21:09.025585", "step": 2105, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:09.055945", "step": 2105, "epoch": 3 }, { "type": "loss", "content": 0.0015365873696282506, "timestamp": "2025-09-15 03:21:09.057913", "step": 2106, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:09.089373", "step": 2106, "epoch": 3 }, { "type": "loss", "content": 0.005631761159747839, "timestamp": "2025-09-15 03:21:09.091425", "step": 2107, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:09.122067", "step": 2107, "epoch": 3 }, { "type": "loss", "content": 0.008225338533520699, "timestamp": "2025-09-15 03:21:09.145599", "step": 2108, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:09.176704", "step": 2108, "epoch": 3 }, { "type": "loss", "content": 0.017039282247424126, "timestamp": "2025-09-15 03:21:09.178813", "step": 2109, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:21:09.912101", "step": 2109, "epoch": 3 }, { "type": "pplx", "content": 61278679.55301822, "timestamp": "2025-09-15 03:21:09.914296", "step": 2109, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:09.943144", "step": 2109, "epoch": 3 }, { "type": "loss", "content": 0.00293324189260602, "timestamp": "2025-09-15 03:21:09.945284", "step": 2110, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:09.976065", "step": 2110, "epoch": 3 }, { "type": "loss", "content": 0.0002962352300528437, "timestamp": "2025-09-15 03:21:09.978298", "step": 2111, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:10.008554", "step": 2111, "epoch": 3 }, { "type": "loss", "content": 0.010974375531077385, "timestamp": "2025-09-15 03:21:10.032052", "step": 2112, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:10.062926", "step": 2112, "epoch": 3 }, { "type": "loss", "content": 0.0032788703683763742, "timestamp": "2025-09-15 03:21:10.064861", "step": 2113, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:10.095904", "step": 2113, "epoch": 3 }, { "type": "loss", "content": 0.028654035180807114, "timestamp": "2025-09-15 03:21:10.098187", "step": 2114, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:10.129439", "step": 2114, "epoch": 3 }, { "type": "loss", "content": 0.0012286610435694456, "timestamp": "2025-09-15 03:21:10.131764", "step": 2115, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:10.162138", "step": 2115, "epoch": 3 }, { "type": "loss", "content": 0.0076230731792747974, "timestamp": "2025-09-15 03:21:10.185656", "step": 2116, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:10.216188", "step": 2116, "epoch": 3 }, { "type": "loss", "content": 0.0033180455211549997, "timestamp": "2025-09-15 03:21:10.218154", "step": 2117, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:10.248299", "step": 2117, "epoch": 3 }, { "type": "loss", "content": 0.038955122232437134, "timestamp": "2025-09-15 03:21:10.250437", "step": 2118, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:10.281329", "step": 2118, "epoch": 3 }, { "type": "loss", "content": 0.0011777119943872094, "timestamp": "2025-09-15 03:21:10.283430", "step": 2119, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:10.313569", "step": 2119, "epoch": 3 }, { "type": "loss", "content": 0.0006529755191877484, "timestamp": "2025-09-15 03:21:10.337184", "step": 2120, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:21:10.367902", "step": 2120, "epoch": 3 }, { "type": "loss", "content": 0.0013485082890838385, "timestamp": "2025-09-15 03:21:10.369919", "step": 2121, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:10.400860", "step": 2121, "epoch": 3 }, { "type": "loss", "content": 0.025854647159576416, "timestamp": "2025-09-15 03:21:10.402919", "step": 2122, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:10.433430", "step": 2122, "epoch": 3 }, { "type": "loss", "content": 0.02108658477663994, "timestamp": "2025-09-15 03:21:10.435446", "step": 2123, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:10.465322", "step": 2123, "epoch": 3 }, { "type": "loss", "content": 0.001103846007026732, "timestamp": "2025-09-15 03:21:10.489035", "step": 2124, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:10.519710", "step": 2124, "epoch": 3 }, { "type": "loss", "content": 0.007358514703810215, "timestamp": "2025-09-15 03:21:10.521856", "step": 2125, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:10.552503", "step": 2125, "epoch": 3 }, { "type": "loss", "content": 0.005075577646493912, "timestamp": "2025-09-15 03:21:10.554566", "step": 2126, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:10.584500", "step": 2126, "epoch": 3 }, { "type": "loss", "content": 0.007805339992046356, "timestamp": "2025-09-15 03:21:10.586806", "step": 2127, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:10.616428", "step": 2127, "epoch": 3 }, { "type": "loss", "content": 0.013435539789497852, "timestamp": "2025-09-15 03:21:10.639871", "step": 2128, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:10.669886", "step": 2128, "epoch": 3 }, { "type": "loss", "content": 0.01014357153326273, "timestamp": "2025-09-15 03:21:10.672026", "step": 2129, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:10.703349", "step": 2129, "epoch": 3 }, { "type": "loss", "content": 0.0019475395092740655, "timestamp": "2025-09-15 03:21:10.705559", "step": 2130, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:10.735939", "step": 2130, "epoch": 3 }, { "type": "loss", "content": 0.0018576009897515178, "timestamp": "2025-09-15 03:21:10.738032", "step": 2131, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:10.768017", "step": 2131, "epoch": 3 }, { "type": "loss", "content": 0.006272417493164539, "timestamp": "2025-09-15 03:21:10.791484", "step": 2132, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:10.821780", "step": 2132, "epoch": 3 }, { "type": "loss", "content": 0.017416836693882942, "timestamp": "2025-09-15 03:21:10.823819", "step": 2133, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:10.854594", "step": 2133, "epoch": 3 }, { "type": "loss", "content": 0.0018346422584727407, "timestamp": "2025-09-15 03:21:10.856629", "step": 2134, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:10.887336", "step": 2134, "epoch": 3 }, { "type": "loss", "content": 0.002072213450446725, "timestamp": "2025-09-15 03:21:10.889639", "step": 2135, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:10.920267", "step": 2135, "epoch": 3 }, { "type": "loss", "content": 0.0008718724129721522, "timestamp": "2025-09-15 03:21:10.943527", "step": 2136, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:10.974610", "step": 2136, "epoch": 3 }, { "type": "loss", "content": 0.0033810038585215807, "timestamp": "2025-09-15 03:21:10.976851", "step": 2137, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:11.008118", "step": 2137, "epoch": 3 }, { "type": "loss", "content": 0.005350136663764715, "timestamp": "2025-09-15 03:21:11.011770", "step": 2138, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:11.048132", "step": 2138, "epoch": 3 }, { "type": "loss", "content": 0.0008165360777638853, "timestamp": "2025-09-15 03:21:11.050158", "step": 2139, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:11.080698", "step": 2139, "epoch": 3 }, { "type": "loss", "content": 0.0024626676458865404, "timestamp": "2025-09-15 03:21:11.104499", "step": 2140, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:11.136724", "step": 2140, "epoch": 3 }, { "type": "loss", "content": 0.0024115960113704205, "timestamp": "2025-09-15 03:21:11.152656", "step": 2141, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:11.191628", "step": 2141, "epoch": 3 }, { "type": "loss", "content": 0.006479751318693161, "timestamp": "2025-09-15 03:21:11.194361", "step": 2142, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:11.224551", "step": 2142, "epoch": 3 }, { "type": "loss", "content": 0.003326453035697341, "timestamp": "2025-09-15 03:21:11.226650", "step": 2143, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:11.256885", "step": 2143, "epoch": 3 }, { "type": "loss", "content": 0.0039823888801038265, "timestamp": "2025-09-15 03:21:11.280947", "step": 2144, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:11.312708", "step": 2144, "epoch": 3 }, { "type": "loss", "content": 0.0015015477547422051, "timestamp": "2025-09-15 03:21:11.314248", "step": 2145, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:11.344818", "step": 2145, "epoch": 3 }, { "type": "loss", "content": 0.00112843734677881, "timestamp": "2025-09-15 03:21:11.346799", "step": 2146, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:11.380143", "step": 2146, "epoch": 3 }, { "type": "loss", "content": 0.007287667598575354, "timestamp": "2025-09-15 03:21:11.382202", "step": 2147, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:11.412237", "step": 2147, "epoch": 3 }, { "type": "loss", "content": 0.001394521677866578, "timestamp": "2025-09-15 03:21:11.435382", "step": 2148, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:11.470791", "step": 2148, "epoch": 3 }, { "type": "loss", "content": 0.009362315759062767, "timestamp": "2025-09-15 03:21:11.475008", "step": 2149, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:11.505632", "step": 2149, "epoch": 3 }, { "type": "loss", "content": 0.0004021845816168934, "timestamp": "2025-09-15 03:21:11.513474", "step": 2150, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:11.544060", "step": 2150, "epoch": 3 }, { "type": "loss", "content": 0.008383884094655514, "timestamp": "2025-09-15 03:21:11.547090", "step": 2151, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:11.577810", "step": 2151, "epoch": 3 }, { "type": "loss", "content": 0.004849635995924473, "timestamp": "2025-09-15 03:21:11.601340", "step": 2152, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:11.633232", "step": 2152, "epoch": 3 }, { "type": "loss", "content": 0.004479328636080027, "timestamp": "2025-09-15 03:21:11.635441", "step": 2153, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:11.667934", "step": 2153, "epoch": 3 }, { "type": "loss", "content": 0.0022254232317209244, "timestamp": "2025-09-15 03:21:11.671004", "step": 2154, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:11.702775", "step": 2154, "epoch": 3 }, { "type": "loss", "content": 0.0015676198527216911, "timestamp": "2025-09-15 03:21:11.705045", "step": 2155, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:11.735864", "step": 2155, "epoch": 3 }, { "type": "loss", "content": 0.004587561823427677, "timestamp": "2025-09-15 03:21:11.759918", "step": 2156, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:11.796079", "step": 2156, "epoch": 3 }, { "type": "loss", "content": 0.0042901188135147095, "timestamp": "2025-09-15 03:21:11.798427", "step": 2157, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:11.828770", "step": 2157, "epoch": 3 }, { "type": "loss", "content": 0.0016539504285901785, "timestamp": "2025-09-15 03:21:11.831852", "step": 2158, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:11.862034", "step": 2158, "epoch": 3 }, { "type": "loss", "content": 0.0031304715666919947, "timestamp": "2025-09-15 03:21:11.864189", "step": 2159, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:11.896918", "step": 2159, "epoch": 3 }, { "type": "loss", "content": 0.002231004647910595, "timestamp": "2025-09-15 03:21:11.920265", "step": 2160, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:11.951703", "step": 2160, "epoch": 3 }, { "type": "loss", "content": 0.010940919630229473, "timestamp": "2025-09-15 03:21:11.953750", "step": 2161, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:21:11.984076", "step": 2161, "epoch": 3 }, { "type": "loss", "content": 0.007696912158280611, "timestamp": "2025-09-15 03:21:11.986626", "step": 2162, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:12.016941", "step": 2162, "epoch": 3 }, { "type": "loss", "content": 0.008095295168459415, "timestamp": "2025-09-15 03:21:12.018710", "step": 2163, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:12.048871", "step": 2163, "epoch": 3 }, { "type": "loss", "content": 0.005427549593150616, "timestamp": "2025-09-15 03:21:12.072142", "step": 2164, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:12.104165", "step": 2164, "epoch": 3 }, { "type": "loss", "content": 0.0006827453034929931, "timestamp": "2025-09-15 03:21:12.106216", "step": 2165, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:12.136734", "step": 2165, "epoch": 3 }, { "type": "loss", "content": 0.004006941802799702, "timestamp": "2025-09-15 03:21:12.138836", "step": 2166, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:21:12.880619", "step": 2166, "epoch": 3 }, { "type": "pplx", "content": 56358345.9592242, "timestamp": "2025-09-15 03:21:12.882669", "step": 2166, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:12.912218", "step": 2166, "epoch": 3 }, { "type": "loss", "content": 0.022966187447309494, "timestamp": "2025-09-15 03:21:12.914406", "step": 2167, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:12.944500", "step": 2167, "epoch": 3 }, { "type": "loss", "content": 0.0029004632961004972, "timestamp": "2025-09-15 03:21:12.968545", "step": 2168, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:12.999243", "step": 2168, "epoch": 3 }, { "type": "loss", "content": 0.005183414090424776, "timestamp": "2025-09-15 03:21:13.001340", "step": 2169, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:21:13.032356", "step": 2169, "epoch": 3 }, { "type": "loss", "content": 0.0002951786736957729, "timestamp": "2025-09-15 03:21:13.034879", "step": 2170, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:13.065815", "step": 2170, "epoch": 3 }, { "type": "loss", "content": 0.001299048657529056, "timestamp": "2025-09-15 03:21:13.067917", "step": 2171, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:13.098628", "step": 2171, "epoch": 3 }, { "type": "loss", "content": 0.0007869719411246479, "timestamp": "2025-09-15 03:21:13.122233", "step": 2172, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:13.153080", "step": 2172, "epoch": 3 }, { "type": "loss", "content": 0.0014901576796546578, "timestamp": "2025-09-15 03:21:13.155302", "step": 2173, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:13.186918", "step": 2173, "epoch": 3 }, { "type": "loss", "content": 0.018634099513292313, "timestamp": "2025-09-15 03:21:13.189188", "step": 2174, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:13.219873", "step": 2174, "epoch": 3 }, { "type": "loss", "content": 0.0033860153052955866, "timestamp": "2025-09-15 03:21:13.221763", "step": 2175, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:13.252492", "step": 2175, "epoch": 3 }, { "type": "loss", "content": 0.0015741854440420866, "timestamp": "2025-09-15 03:21:13.275939", "step": 2176, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:13.306567", "step": 2176, "epoch": 3 }, { "type": "loss", "content": 0.002750501735135913, "timestamp": "2025-09-15 03:21:13.308507", "step": 2177, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:13.340242", "step": 2177, "epoch": 3 }, { "type": "loss", "content": 0.07448706775903702, "timestamp": "2025-09-15 03:21:13.342337", "step": 2178, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:13.372602", "step": 2178, "epoch": 3 }, { "type": "loss", "content": 0.0007458441541530192, "timestamp": "2025-09-15 03:21:13.375390", "step": 2179, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:13.406151", "step": 2179, "epoch": 3 }, { "type": "loss", "content": 0.0018982002511620522, "timestamp": "2025-09-15 03:21:13.429758", "step": 2180, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:13.460680", "step": 2180, "epoch": 3 }, { "type": "loss", "content": 0.0009121177718043327, "timestamp": "2025-09-15 03:21:13.462911", "step": 2181, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:13.493418", "step": 2181, "epoch": 3 }, { "type": "loss", "content": 0.0008120434358716011, "timestamp": "2025-09-15 03:21:13.495546", "step": 2182, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:13.526592", "step": 2182, "epoch": 3 }, { "type": "loss", "content": 0.0011436872882768512, "timestamp": "2025-09-15 03:21:13.528907", "step": 2183, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:13.559433", "step": 2183, "epoch": 3 }, { "type": "loss", "content": 0.0037187892012298107, "timestamp": "2025-09-15 03:21:13.583053", "step": 2184, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:13.613327", "step": 2184, "epoch": 3 }, { "type": "loss", "content": 0.009429527446627617, "timestamp": "2025-09-15 03:21:13.615437", "step": 2185, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:13.648087", "step": 2185, "epoch": 3 }, { "type": "loss", "content": 0.0013286563334986567, "timestamp": "2025-09-15 03:21:13.650170", "step": 2186, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:13.680847", "step": 2186, "epoch": 3 }, { "type": "loss", "content": 0.02385484054684639, "timestamp": "2025-09-15 03:21:13.683310", "step": 2187, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:13.714257", "step": 2187, "epoch": 3 }, { "type": "loss", "content": 0.002276848303154111, "timestamp": "2025-09-15 03:21:13.737795", "step": 2188, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:13.768423", "step": 2188, "epoch": 3 }, { "type": "loss", "content": 0.005591376684606075, "timestamp": "2025-09-15 03:21:13.770729", "step": 2189, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:13.801012", "step": 2189, "epoch": 3 }, { "type": "loss", "content": 0.002838743384927511, "timestamp": "2025-09-15 03:21:13.803036", "step": 2190, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:13.833582", "step": 2190, "epoch": 3 }, { "type": "loss", "content": 0.010679845698177814, "timestamp": "2025-09-15 03:21:13.835668", "step": 2191, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:13.865645", "step": 2191, "epoch": 3 }, { "type": "loss", "content": 0.0006234788452275097, "timestamp": "2025-09-15 03:21:13.889090", "step": 2192, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:13.921076", "step": 2192, "epoch": 3 }, { "type": "loss", "content": 0.00037034088745713234, "timestamp": "2025-09-15 03:21:13.923242", "step": 2193, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:13.954154", "step": 2193, "epoch": 3 }, { "type": "loss", "content": 0.016694631427526474, "timestamp": "2025-09-15 03:21:13.956542", "step": 2194, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:13.986376", "step": 2194, "epoch": 3 }, { "type": "loss", "content": 0.000887808040715754, "timestamp": "2025-09-15 03:21:13.988785", "step": 2195, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:14.019258", "step": 2195, "epoch": 3 }, { "type": "loss", "content": 0.03821894899010658, "timestamp": "2025-09-15 03:21:14.042892", "step": 2196, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:14.073956", "step": 2196, "epoch": 3 }, { "type": "loss", "content": 0.018787646666169167, "timestamp": "2025-09-15 03:21:14.075896", "step": 2197, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:14.106724", "step": 2197, "epoch": 3 }, { "type": "loss", "content": 0.0020286752842366695, "timestamp": "2025-09-15 03:21:14.108849", "step": 2198, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:14.143937", "step": 2198, "epoch": 3 }, { "type": "loss", "content": 0.0011020256206393242, "timestamp": "2025-09-15 03:21:14.145897", "step": 2199, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:14.177252", "step": 2199, "epoch": 3 }, { "type": "loss", "content": 0.000470021681394428, "timestamp": "2025-09-15 03:21:14.200843", "step": 2200, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:21:14.231636", "step": 2200, "epoch": 3 }, { "type": "loss", "content": 0.0018282111268490553, "timestamp": "2025-09-15 03:21:14.234022", "step": 2201, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:14.264926", "step": 2201, "epoch": 3 }, { "type": "loss", "content": 0.01462145708501339, "timestamp": "2025-09-15 03:21:14.267006", "step": 2202, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:14.297935", "step": 2202, "epoch": 3 }, { "type": "loss", "content": 0.0006548243691213429, "timestamp": "2025-09-15 03:21:14.299962", "step": 2203, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:14.330723", "step": 2203, "epoch": 3 }, { "type": "loss", "content": 0.0006687435670755804, "timestamp": "2025-09-15 03:21:14.354453", "step": 2204, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:14.385874", "step": 2204, "epoch": 3 }, { "type": "loss", "content": 0.0005846781423315406, "timestamp": "2025-09-15 03:21:14.388104", "step": 2205, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:14.418666", "step": 2205, "epoch": 3 }, { "type": "loss", "content": 0.0019477332243695855, "timestamp": "2025-09-15 03:21:14.420774", "step": 2206, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:14.451662", "step": 2206, "epoch": 3 }, { "type": "loss", "content": 0.0016808919608592987, "timestamp": "2025-09-15 03:21:14.453842", "step": 2207, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:14.484567", "step": 2207, "epoch": 3 }, { "type": "loss", "content": 0.010064328089356422, "timestamp": "2025-09-15 03:21:14.508105", "step": 2208, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:14.539688", "step": 2208, "epoch": 3 }, { "type": "loss", "content": 0.0018260888755321503, "timestamp": "2025-09-15 03:21:14.542077", "step": 2209, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:14.572446", "step": 2209, "epoch": 3 }, { "type": "loss", "content": 0.002042062347754836, "timestamp": "2025-09-15 03:21:14.574751", "step": 2210, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:14.605689", "step": 2210, "epoch": 3 }, { "type": "loss", "content": 0.000529797631315887, "timestamp": "2025-09-15 03:21:14.607863", "step": 2211, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:14.638139", "step": 2211, "epoch": 3 }, { "type": "loss", "content": 0.001175111741758883, "timestamp": "2025-09-15 03:21:14.661647", "step": 2212, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:14.691892", "step": 2212, "epoch": 3 }, { "type": "loss", "content": 0.00947808288037777, "timestamp": "2025-09-15 03:21:14.694066", "step": 2213, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:14.724412", "step": 2213, "epoch": 3 }, { "type": "loss", "content": 0.0023165601305663586, "timestamp": "2025-09-15 03:21:14.726573", "step": 2214, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:14.757033", "step": 2214, "epoch": 3 }, { "type": "loss", "content": 0.0019492261344566941, "timestamp": "2025-09-15 03:21:14.758985", "step": 2215, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:14.790110", "step": 2215, "epoch": 3 }, { "type": "loss", "content": 0.012912404723465443, "timestamp": "2025-09-15 03:21:14.813541", "step": 2216, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:14.845333", "step": 2216, "epoch": 3 }, { "type": "loss", "content": 0.008791116066277027, "timestamp": "2025-09-15 03:21:14.847413", "step": 2217, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:14.878065", "step": 2217, "epoch": 3 }, { "type": "loss", "content": 0.02137189917266369, "timestamp": "2025-09-15 03:21:14.880182", "step": 2218, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:14.911002", "step": 2218, "epoch": 3 }, { "type": "loss", "content": 0.0007815445424057543, "timestamp": "2025-09-15 03:21:14.913266", "step": 2219, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:14.945319", "step": 2219, "epoch": 3 }, { "type": "loss", "content": 0.0015050854999572039, "timestamp": "2025-09-15 03:21:14.968862", "step": 2220, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:14.999022", "step": 2220, "epoch": 3 }, { "type": "loss", "content": 0.00025126116815954447, "timestamp": "2025-09-15 03:21:15.001187", "step": 2221, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:15.031485", "step": 2221, "epoch": 3 }, { "type": "loss", "content": 0.0011576504912227392, "timestamp": "2025-09-15 03:21:15.033513", "step": 2222, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:15.065030", "step": 2222, "epoch": 3 }, { "type": "loss", "content": 0.00018423503206577152, "timestamp": "2025-09-15 03:21:15.069373", "step": 2223, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:21:15.807235", "step": 2223, "epoch": 3 }, { "type": "pplx", "content": 62849603.449168004, "timestamp": "2025-09-15 03:21:15.809112", "step": 2223, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:15.837733", "step": 2223, "epoch": 3 }, { "type": "loss", "content": 0.002890202449634671, "timestamp": "2025-09-15 03:21:15.861190", "step": 2224, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:15.891662", "step": 2224, "epoch": 3 }, { "type": "loss", "content": 0.001774442265741527, "timestamp": "2025-09-15 03:21:15.893634", "step": 2225, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:15.923668", "step": 2225, "epoch": 3 }, { "type": "loss", "content": 0.0007153319893404841, "timestamp": "2025-09-15 03:21:15.925724", "step": 2226, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:15.956847", "step": 2226, "epoch": 3 }, { "type": "loss", "content": 0.01221728976815939, "timestamp": "2025-09-15 03:21:15.958817", "step": 2227, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:21:15.989652", "step": 2227, "epoch": 3 }, { "type": "loss", "content": 0.01373725850135088, "timestamp": "2025-09-15 03:21:16.013004", "step": 2228, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:16.043676", "step": 2228, "epoch": 3 }, { "type": "loss", "content": 0.00021498788555618376, "timestamp": "2025-09-15 03:21:16.045661", "step": 2229, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:16.075571", "step": 2229, "epoch": 3 }, { "type": "loss", "content": 0.0012636370956897736, "timestamp": "2025-09-15 03:21:16.077641", "step": 2230, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:16.107738", "step": 2230, "epoch": 3 }, { "type": "loss", "content": 9.673281601862982e-05, "timestamp": "2025-09-15 03:21:16.109877", "step": 2231, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:16.140124", "step": 2231, "epoch": 3 }, { "type": "loss", "content": 0.012603862211108208, "timestamp": "2025-09-15 03:21:16.163586", "step": 2232, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:16.194597", "step": 2232, "epoch": 3 }, { "type": "loss", "content": 0.0013240514090284705, "timestamp": "2025-09-15 03:21:16.196850", "step": 2233, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:16.227367", "step": 2233, "epoch": 3 }, { "type": "loss", "content": 0.0012036258121952415, "timestamp": "2025-09-15 03:21:16.229378", "step": 2234, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:16.260090", "step": 2234, "epoch": 3 }, { "type": "loss", "content": 0.0036719846539199352, "timestamp": "2025-09-15 03:21:16.262339", "step": 2235, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:16.292842", "step": 2235, "epoch": 3 }, { "type": "loss", "content": 0.0012049399083480239, "timestamp": "2025-09-15 03:21:16.316527", "step": 2236, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:16.346845", "step": 2236, "epoch": 3 }, { "type": "loss", "content": 0.0011176351690664887, "timestamp": "2025-09-15 03:21:16.348838", "step": 2237, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:16.384654", "step": 2237, "epoch": 3 }, { "type": "loss", "content": 0.006942094769328833, "timestamp": "2025-09-15 03:21:16.386594", "step": 2238, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:16.417748", "step": 2238, "epoch": 3 }, { "type": "loss", "content": 0.019507378339767456, "timestamp": "2025-09-15 03:21:16.419628", "step": 2239, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:16.450329", "step": 2239, "epoch": 3 }, { "type": "loss", "content": 0.0027631439734250307, "timestamp": "2025-09-15 03:21:16.473991", "step": 2240, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:16.504227", "step": 2240, "epoch": 3 }, { "type": "loss", "content": 0.0008822910604067147, "timestamp": "2025-09-15 03:21:16.506257", "step": 2241, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:16.537441", "step": 2241, "epoch": 3 }, { "type": "loss", "content": 0.00015964095655363053, "timestamp": "2025-09-15 03:21:16.539530", "step": 2242, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:16.570264", "step": 2242, "epoch": 3 }, { "type": "loss", "content": 0.0007311701192520559, "timestamp": "2025-09-15 03:21:16.572110", "step": 2243, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:16.605715", "step": 2243, "epoch": 3 }, { "type": "loss", "content": 0.002480999333783984, "timestamp": "2025-09-15 03:21:16.629277", "step": 2244, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:16.659788", "step": 2244, "epoch": 3 }, { "type": "loss", "content": 0.0024871390778571367, "timestamp": "2025-09-15 03:21:16.661685", "step": 2245, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:16.691599", "step": 2245, "epoch": 3 }, { "type": "loss", "content": 0.006760665215551853, "timestamp": "2025-09-15 03:21:16.693867", "step": 2246, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:16.724268", "step": 2246, "epoch": 3 }, { "type": "loss", "content": 0.0028482077177613974, "timestamp": "2025-09-15 03:21:16.726423", "step": 2247, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:16.757079", "step": 2247, "epoch": 3 }, { "type": "loss", "content": 0.004411118105053902, "timestamp": "2025-09-15 03:21:16.780465", "step": 2248, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:16.811929", "step": 2248, "epoch": 3 }, { "type": "loss", "content": 0.004571598023176193, "timestamp": "2025-09-15 03:21:16.819061", "step": 2249, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:16.857899", "step": 2249, "epoch": 3 }, { "type": "loss", "content": 0.00016060953203123063, "timestamp": "2025-09-15 03:21:16.860019", "step": 2250, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:16.891229", "step": 2250, "epoch": 3 }, { "type": "loss", "content": 0.0028591505251824856, "timestamp": "2025-09-15 03:21:16.893416", "step": 2251, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:16.924543", "step": 2251, "epoch": 3 }, { "type": "loss", "content": 0.004463012330234051, "timestamp": "2025-09-15 03:21:16.947868", "step": 2252, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:16.978508", "step": 2252, "epoch": 3 }, { "type": "loss", "content": 0.001512286951765418, "timestamp": "2025-09-15 03:21:16.980651", "step": 2253, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:17.012210", "step": 2253, "epoch": 3 }, { "type": "loss", "content": 0.000547349511180073, "timestamp": "2025-09-15 03:21:17.013936", "step": 2254, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:17.045543", "step": 2254, "epoch": 3 }, { "type": "loss", "content": 0.0014820124488323927, "timestamp": "2025-09-15 03:21:17.047769", "step": 2255, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:17.077997", "step": 2255, "epoch": 3 }, { "type": "loss", "content": 8.133659866871312e-05, "timestamp": "2025-09-15 03:21:17.101492", "step": 2256, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:17.132476", "step": 2256, "epoch": 3 }, { "type": "loss", "content": 8.59004576341249e-05, "timestamp": "2025-09-15 03:21:17.134733", "step": 2257, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:17.167674", "step": 2257, "epoch": 3 }, { "type": "loss", "content": 0.0020845939870923758, "timestamp": "2025-09-15 03:21:17.169736", "step": 2258, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:17.200606", "step": 2258, "epoch": 3 }, { "type": "loss", "content": 0.0009439904824830592, "timestamp": "2025-09-15 03:21:17.203663", "step": 2259, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:17.233793", "step": 2259, "epoch": 3 }, { "type": "loss", "content": 0.004552639089524746, "timestamp": "2025-09-15 03:21:17.259120", "step": 2260, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:17.294296", "step": 2260, "epoch": 3 }, { "type": "loss", "content": 0.002391277113929391, "timestamp": "2025-09-15 03:21:17.299354", "step": 2261, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:17.330817", "step": 2261, "epoch": 3 }, { "type": "loss", "content": 0.0007121101371012628, "timestamp": "2025-09-15 03:21:17.333362", "step": 2262, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:17.365177", "step": 2262, "epoch": 3 }, { "type": "loss", "content": 0.004409910179674625, "timestamp": "2025-09-15 03:21:17.367233", "step": 2263, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:17.398381", "step": 2263, "epoch": 3 }, { "type": "loss", "content": 0.001686051837168634, "timestamp": "2025-09-15 03:21:17.421980", "step": 2264, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:17.452806", "step": 2264, "epoch": 3 }, { "type": "loss", "content": 7.227421883726493e-05, "timestamp": "2025-09-15 03:21:17.455045", "step": 2265, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:17.485671", "step": 2265, "epoch": 3 }, { "type": "loss", "content": 0.0006697883945889771, "timestamp": "2025-09-15 03:21:17.487839", "step": 2266, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:17.518474", "step": 2266, "epoch": 3 }, { "type": "loss", "content": 0.002797195687890053, "timestamp": "2025-09-15 03:21:17.520562", "step": 2267, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:21:17.550575", "step": 2267, "epoch": 3 }, { "type": "loss", "content": 0.0032074761111289263, "timestamp": "2025-09-15 03:21:17.573993", "step": 2268, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:17.604180", "step": 2268, "epoch": 3 }, { "type": "loss", "content": 0.00026407671975903213, "timestamp": "2025-09-15 03:21:17.606205", "step": 2269, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:17.636422", "step": 2269, "epoch": 3 }, { "type": "loss", "content": 0.0003585081431083381, "timestamp": "2025-09-15 03:21:17.639759", "step": 2270, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:17.670297", "step": 2270, "epoch": 3 }, { "type": "loss", "content": 0.000742782314773649, "timestamp": "2025-09-15 03:21:17.672400", "step": 2271, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:17.703628", "step": 2271, "epoch": 3 }, { "type": "loss", "content": 0.0006893987883813679, "timestamp": "2025-09-15 03:21:17.727178", "step": 2272, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:17.757568", "step": 2272, "epoch": 3 }, { "type": "loss", "content": 0.0043635121546685696, "timestamp": "2025-09-15 03:21:17.759543", "step": 2273, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:17.789442", "step": 2273, "epoch": 3 }, { "type": "loss", "content": 0.002095921663567424, "timestamp": "2025-09-15 03:21:17.791184", "step": 2274, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:17.821480", "step": 2274, "epoch": 3 }, { "type": "loss", "content": 0.0013823070330545306, "timestamp": "2025-09-15 03:21:17.823469", "step": 2275, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:17.853314", "step": 2275, "epoch": 3 }, { "type": "loss", "content": 0.006985916756093502, "timestamp": "2025-09-15 03:21:17.876854", "step": 2276, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:17.907485", "step": 2276, "epoch": 3 }, { "type": "loss", "content": 0.00044175630318932235, "timestamp": "2025-09-15 03:21:17.909554", "step": 2277, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:17.939414", "step": 2277, "epoch": 3 }, { "type": "loss", "content": 0.03420780971646309, "timestamp": "2025-09-15 03:21:17.941547", "step": 2278, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:17.973125", "step": 2278, "epoch": 3 }, { "type": "loss", "content": 0.00027540497831068933, "timestamp": "2025-09-15 03:21:17.975592", "step": 2279, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:18.007275", "step": 2279, "epoch": 3 }, { "type": "loss", "content": 0.00519139226526022, "timestamp": "2025-09-15 03:21:18.030680", "step": 2280, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:21:18.770531", "step": 2280, "epoch": 3 }, { "type": "pplx", "content": 67494214.18423404, "timestamp": "2025-09-15 03:21:18.772658", "step": 2280, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:18.800657", "step": 2280, "epoch": 3 }, { "type": "loss", "content": 0.00019070318376179785, "timestamp": "2025-09-15 03:21:18.802774", "step": 2281, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:18.832388", "step": 2281, "epoch": 3 }, { "type": "loss", "content": 0.00020345590019132942, "timestamp": "2025-09-15 03:21:18.834650", "step": 2282, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:18.864868", "step": 2282, "epoch": 3 }, { "type": "loss", "content": 8.062987762968987e-05, "timestamp": "2025-09-15 03:21:18.867230", "step": 2283, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:18.897898", "step": 2283, "epoch": 3 }, { "type": "loss", "content": 0.0009913406101986766, "timestamp": "2025-09-15 03:21:18.921471", "step": 2284, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:18.951838", "step": 2284, "epoch": 3 }, { "type": "loss", "content": 0.0008175976690836251, "timestamp": "2025-09-15 03:21:18.953996", "step": 2285, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:18.985740", "step": 2285, "epoch": 3 }, { "type": "loss", "content": 7.580334931844845e-05, "timestamp": "2025-09-15 03:21:18.988295", "step": 2286, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:19.019438", "step": 2286, "epoch": 3 }, { "type": "loss", "content": 0.00028774369275197387, "timestamp": "2025-09-15 03:21:19.021740", "step": 2287, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:21:19.054509", "step": 2287, "epoch": 3 }, { "type": "loss", "content": 0.0005485046422109008, "timestamp": "2025-09-15 03:21:19.080024", "step": 2288, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:19.110975", "step": 2288, "epoch": 3 }, { "type": "loss", "content": 0.00015744587290100753, "timestamp": "2025-09-15 03:21:19.113163", "step": 2289, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:19.144343", "step": 2289, "epoch": 3 }, { "type": "loss", "content": 0.0001991561584873125, "timestamp": "2025-09-15 03:21:19.146647", "step": 2290, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:19.177367", "step": 2290, "epoch": 3 }, { "type": "loss", "content": 0.008997390046715736, "timestamp": "2025-09-15 03:21:19.179781", "step": 2291, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:19.210684", "step": 2291, "epoch": 3 }, { "type": "loss", "content": 0.002861560555174947, "timestamp": "2025-09-15 03:21:19.234419", "step": 2292, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:19.264635", "step": 2292, "epoch": 3 }, { "type": "loss", "content": 0.0005258307792246342, "timestamp": "2025-09-15 03:21:19.266801", "step": 2293, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:19.296948", "step": 2293, "epoch": 3 }, { "type": "loss", "content": 0.0035565574653446674, "timestamp": "2025-09-15 03:21:19.299181", "step": 2294, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:19.329233", "step": 2294, "epoch": 3 }, { "type": "loss", "content": 0.001044388976879418, "timestamp": "2025-09-15 03:21:19.331464", "step": 2295, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:19.361435", "step": 2295, "epoch": 3 }, { "type": "loss", "content": 0.0002443413541186601, "timestamp": "2025-09-15 03:21:19.385263", "step": 2296, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:19.415785", "step": 2296, "epoch": 3 }, { "type": "loss", "content": 0.0006027042982168496, "timestamp": "2025-09-15 03:21:19.417901", "step": 2297, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:19.448106", "step": 2297, "epoch": 3 }, { "type": "loss", "content": 0.0026935446076095104, "timestamp": "2025-09-15 03:21:19.450301", "step": 2298, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:19.480885", "step": 2298, "epoch": 3 }, { "type": "loss", "content": 0.00013987746206112206, "timestamp": "2025-09-15 03:21:19.483185", "step": 2299, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:19.512985", "step": 2299, "epoch": 3 }, { "type": "loss", "content": 0.0003008446656167507, "timestamp": "2025-09-15 03:21:19.537256", "step": 2300, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:19.567766", "step": 2300, "epoch": 3 }, { "type": "loss", "content": 0.0002455053327139467, "timestamp": "2025-09-15 03:21:19.569739", "step": 2301, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:19.599903", "step": 2301, "epoch": 3 }, { "type": "loss", "content": 7.855286821722984e-05, "timestamp": "2025-09-15 03:21:19.601961", "step": 2302, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:19.633244", "step": 2302, "epoch": 3 }, { "type": "loss", "content": 0.018144680187106133, "timestamp": "2025-09-15 03:21:19.635340", "step": 2303, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:19.665144", "step": 2303, "epoch": 3 }, { "type": "loss", "content": 0.0017443523975089192, "timestamp": "2025-09-15 03:21:19.688834", "step": 2304, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:19.719007", "step": 2304, "epoch": 3 }, { "type": "loss", "content": 0.00037380401045084, "timestamp": "2025-09-15 03:21:19.721921", "step": 2305, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:19.751988", "step": 2305, "epoch": 3 }, { "type": "loss", "content": 0.0004997936775907874, "timestamp": "2025-09-15 03:21:19.754299", "step": 2306, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:19.784702", "step": 2306, "epoch": 3 }, { "type": "loss", "content": 0.014436488971114159, "timestamp": "2025-09-15 03:21:19.786890", "step": 2307, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:19.817394", "step": 2307, "epoch": 3 }, { "type": "loss", "content": 0.00046557295718230307, "timestamp": "2025-09-15 03:21:19.841063", "step": 2308, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:19.871549", "step": 2308, "epoch": 3 }, { "type": "loss", "content": 0.00021677868789993227, "timestamp": "2025-09-15 03:21:19.873851", "step": 2309, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:19.904416", "step": 2309, "epoch": 3 }, { "type": "loss", "content": 0.0003104743082076311, "timestamp": "2025-09-15 03:21:19.906634", "step": 2310, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:19.936762", "step": 2310, "epoch": 3 }, { "type": "loss", "content": 0.0001938850909937173, "timestamp": "2025-09-15 03:21:19.938781", "step": 2311, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:19.969518", "step": 2311, "epoch": 3 }, { "type": "loss", "content": 0.0005853885086253285, "timestamp": "2025-09-15 03:21:19.992993", "step": 2312, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:20.023242", "step": 2312, "epoch": 3 }, { "type": "loss", "content": 0.00036151515087112784, "timestamp": "2025-09-15 03:21:20.025591", "step": 2313, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:20.055885", "step": 2313, "epoch": 3 }, { "type": "loss", "content": 8.350707503268495e-05, "timestamp": "2025-09-15 03:21:20.058007", "step": 2314, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:20.087636", "step": 2314, "epoch": 3 }, { "type": "loss", "content": 0.0010728834895417094, "timestamp": "2025-09-15 03:21:20.089729", "step": 2315, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:20.119777", "step": 2315, "epoch": 3 }, { "type": "loss", "content": 0.00021373596973717213, "timestamp": "2025-09-15 03:21:20.143249", "step": 2316, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:20.173933", "step": 2316, "epoch": 3 }, { "type": "loss", "content": 8.931905904319137e-05, "timestamp": "2025-09-15 03:21:20.176033", "step": 2317, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:20.208500", "step": 2317, "epoch": 3 }, { "type": "loss", "content": 0.0002284930378664285, "timestamp": "2025-09-15 03:21:20.210577", "step": 2318, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:20.242319", "step": 2318, "epoch": 3 }, { "type": "loss", "content": 0.00015003184671513736, "timestamp": "2025-09-15 03:21:20.244631", "step": 2319, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:20.276639", "step": 2319, "epoch": 3 }, { "type": "loss", "content": 0.0008775495225563645, "timestamp": "2025-09-15 03:21:20.300191", "step": 2320, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:20.331757", "step": 2320, "epoch": 3 }, { "type": "loss", "content": 0.0004940013168379664, "timestamp": "2025-09-15 03:21:20.333891", "step": 2321, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:20.364748", "step": 2321, "epoch": 3 }, { "type": "loss", "content": 0.00012955373676959425, "timestamp": "2025-09-15 03:21:20.366796", "step": 2322, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:20.397555", "step": 2322, "epoch": 3 }, { "type": "loss", "content": 0.00020479969680309296, "timestamp": "2025-09-15 03:21:20.399821", "step": 2323, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:20.430759", "step": 2323, "epoch": 3 }, { "type": "loss", "content": 0.00027445584419183433, "timestamp": "2025-09-15 03:21:20.454353", "step": 2324, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:20.485387", "step": 2324, "epoch": 3 }, { "type": "loss", "content": 0.00012975472782272846, "timestamp": "2025-09-15 03:21:20.487673", "step": 2325, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:20.519308", "step": 2325, "epoch": 3 }, { "type": "loss", "content": 0.0025794480461627245, "timestamp": "2025-09-15 03:21:20.521621", "step": 2326, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:20.551687", "step": 2326, "epoch": 3 }, { "type": "loss", "content": 0.005738633684813976, "timestamp": "2025-09-15 03:21:20.553942", "step": 2327, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:20.585340", "step": 2327, "epoch": 3 }, { "type": "loss", "content": 0.0008769879932515323, "timestamp": "2025-09-15 03:21:20.608962", "step": 2328, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:20.639383", "step": 2328, "epoch": 3 }, { "type": "loss", "content": 0.0002040156105067581, "timestamp": "2025-09-15 03:21:20.641512", "step": 2329, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:20.671473", "step": 2329, "epoch": 3 }, { "type": "loss", "content": 0.00022350263316184282, "timestamp": "2025-09-15 03:21:20.673708", "step": 2330, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:20.704482", "step": 2330, "epoch": 3 }, { "type": "loss", "content": 0.00019051216077059507, "timestamp": "2025-09-15 03:21:20.706634", "step": 2331, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:20.738146", "step": 2331, "epoch": 3 }, { "type": "loss", "content": 0.001020329655148089, "timestamp": "2025-09-15 03:21:20.761649", "step": 2332, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:20.792571", "step": 2332, "epoch": 3 }, { "type": "loss", "content": 0.00016312638763338327, "timestamp": "2025-09-15 03:21:20.794977", "step": 2333, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:20.825203", "step": 2333, "epoch": 3 }, { "type": "loss", "content": 0.00018660161003936082, "timestamp": "2025-09-15 03:21:20.827348", "step": 2334, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:20.857435", "step": 2334, "epoch": 3 }, { "type": "loss", "content": 0.013526364229619503, "timestamp": "2025-09-15 03:21:20.859717", "step": 2335, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:20.890701", "step": 2335, "epoch": 3 }, { "type": "loss", "content": 0.0001784945634426549, "timestamp": "2025-09-15 03:21:20.914265", "step": 2336, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:20.944920", "step": 2336, "epoch": 3 }, { "type": "loss", "content": 0.003306002588942647, "timestamp": "2025-09-15 03:21:20.946961", "step": 2337, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:21:21.679516", "step": 2337, "epoch": 3 }, { "type": "pplx", "content": 60885470.31679232, "timestamp": "2025-09-15 03:21:21.681134", "step": 2337, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:21.709786", "step": 2337, "epoch": 3 }, { "type": "loss", "content": 0.00014974501391407102, "timestamp": "2025-09-15 03:21:21.712019", "step": 2338, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:21.741925", "step": 2338, "epoch": 3 }, { "type": "loss", "content": 0.00010141759412363172, "timestamp": "2025-09-15 03:21:21.743945", "step": 2339, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:21.774134", "step": 2339, "epoch": 3 }, { "type": "loss", "content": 0.0013717318652197719, "timestamp": "2025-09-15 03:21:21.797646", "step": 2340, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:21.828752", "step": 2340, "epoch": 3 }, { "type": "loss", "content": 0.0010869607795029879, "timestamp": "2025-09-15 03:21:21.831018", "step": 2341, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:21.860961", "step": 2341, "epoch": 3 }, { "type": "loss", "content": 0.001997420797124505, "timestamp": "2025-09-15 03:21:21.863369", "step": 2342, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:21.894261", "step": 2342, "epoch": 3 }, { "type": "loss", "content": 0.00011396995250834152, "timestamp": "2025-09-15 03:21:21.896439", "step": 2343, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:21.926555", "step": 2343, "epoch": 3 }, { "type": "loss", "content": 0.0003269371227361262, "timestamp": "2025-09-15 03:21:21.949966", "step": 2344, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:21.980383", "step": 2344, "epoch": 3 }, { "type": "loss", "content": 0.010744208469986916, "timestamp": "2025-09-15 03:21:21.982465", "step": 2345, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:22.012953", "step": 2345, "epoch": 3 }, { "type": "loss", "content": 0.00023648412025067955, "timestamp": "2025-09-15 03:21:22.015623", "step": 2346, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:22.045606", "step": 2346, "epoch": 3 }, { "type": "loss", "content": 0.00028562903753481805, "timestamp": "2025-09-15 03:21:22.047791", "step": 2347, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:22.078104", "step": 2347, "epoch": 3 }, { "type": "loss", "content": 0.00023739961034152657, "timestamp": "2025-09-15 03:21:22.101647", "step": 2348, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:22.131433", "step": 2348, "epoch": 3 }, { "type": "loss", "content": 0.00021465822646860033, "timestamp": "2025-09-15 03:21:22.133327", "step": 2349, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:22.163473", "step": 2349, "epoch": 3 }, { "type": "loss", "content": 0.002316099824383855, "timestamp": "2025-09-15 03:21:22.165609", "step": 2350, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:22.195887", "step": 2350, "epoch": 3 }, { "type": "loss", "content": 0.0008549098274670541, "timestamp": "2025-09-15 03:21:22.198549", "step": 2351, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:22.228745", "step": 2351, "epoch": 3 }, { "type": "loss", "content": 0.0005782007938250899, "timestamp": "2025-09-15 03:21:22.252244", "step": 2352, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:22.282795", "step": 2352, "epoch": 3 }, { "type": "loss", "content": 0.00010357372957514599, "timestamp": "2025-09-15 03:21:22.284879", "step": 2353, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:22.316229", "step": 2353, "epoch": 3 }, { "type": "loss", "content": 0.0057270945981144905, "timestamp": "2025-09-15 03:21:22.318317", "step": 2354, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:22.348685", "step": 2354, "epoch": 3 }, { "type": "loss", "content": 0.0004270931822247803, "timestamp": "2025-09-15 03:21:22.350847", "step": 2355, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:22.381214", "step": 2355, "epoch": 3 }, { "type": "loss", "content": 0.00011959804396610707, "timestamp": "2025-09-15 03:21:22.404826", "step": 2356, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:22.435320", "step": 2356, "epoch": 3 }, { "type": "loss", "content": 0.0002549807832110673, "timestamp": "2025-09-15 03:21:22.437394", "step": 2357, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:22.468229", "step": 2357, "epoch": 3 }, { "type": "loss", "content": 0.0006122398190200329, "timestamp": "2025-09-15 03:21:22.470293", "step": 2358, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:22.500178", "step": 2358, "epoch": 3 }, { "type": "loss", "content": 0.0005783793749287724, "timestamp": "2025-09-15 03:21:22.502304", "step": 2359, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:22.532713", "step": 2359, "epoch": 3 }, { "type": "loss", "content": 0.005947014782577753, "timestamp": "2025-09-15 03:21:22.556353", "step": 2360, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:22.586717", "step": 2360, "epoch": 3 }, { "type": "loss", "content": 0.00028083566576242447, "timestamp": "2025-09-15 03:21:22.588954", "step": 2361, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:22.618813", "step": 2361, "epoch": 3 }, { "type": "loss", "content": 0.00033108098432421684, "timestamp": "2025-09-15 03:21:22.620940", "step": 2362, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:22.650854", "step": 2362, "epoch": 3 }, { "type": "loss", "content": 0.0001367040240438655, "timestamp": "2025-09-15 03:21:22.652864", "step": 2363, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:22.682727", "step": 2363, "epoch": 3 }, { "type": "loss", "content": 0.00224878778681159, "timestamp": "2025-09-15 03:21:22.705846", "step": 2364, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:22.736412", "step": 2364, "epoch": 3 }, { "type": "loss", "content": 0.0010682273423299193, "timestamp": "2025-09-15 03:21:22.738755", "step": 2365, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:22.769505", "step": 2365, "epoch": 3 }, { "type": "loss", "content": 0.005659495014697313, "timestamp": "2025-09-15 03:21:22.771479", "step": 2366, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:22.801326", "step": 2366, "epoch": 3 }, { "type": "loss", "content": 0.006955933757126331, "timestamp": "2025-09-15 03:21:22.803365", "step": 2367, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:22.834409", "step": 2367, "epoch": 3 }, { "type": "loss", "content": 0.0006349317845888436, "timestamp": "2025-09-15 03:21:22.857825", "step": 2368, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:22.887859", "step": 2368, "epoch": 3 }, { "type": "loss", "content": 0.0021447453182190657, "timestamp": "2025-09-15 03:21:22.889863", "step": 2369, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:22.919385", "step": 2369, "epoch": 3 }, { "type": "loss", "content": 0.0001313880638917908, "timestamp": "2025-09-15 03:21:22.921417", "step": 2370, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:22.951515", "step": 2370, "epoch": 3 }, { "type": "loss", "content": 0.00024139614833984524, "timestamp": "2025-09-15 03:21:22.953681", "step": 2371, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:22.984772", "step": 2371, "epoch": 3 }, { "type": "loss", "content": 0.00474166963249445, "timestamp": "2025-09-15 03:21:23.008291", "step": 2372, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:23.038395", "step": 2372, "epoch": 3 }, { "type": "loss", "content": 0.0003615278110373765, "timestamp": "2025-09-15 03:21:23.040435", "step": 2373, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:23.071395", "step": 2373, "epoch": 3 }, { "type": "loss", "content": 0.0063685099594295025, "timestamp": "2025-09-15 03:21:23.073744", "step": 2374, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:23.104428", "step": 2374, "epoch": 3 }, { "type": "loss", "content": 0.0005614294786937535, "timestamp": "2025-09-15 03:21:23.106666", "step": 2375, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:23.138159", "step": 2375, "epoch": 3 }, { "type": "loss", "content": 0.04361369088292122, "timestamp": "2025-09-15 03:21:23.161706", "step": 2376, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:23.191616", "step": 2376, "epoch": 3 }, { "type": "loss", "content": 0.00016917834000196308, "timestamp": "2025-09-15 03:21:23.193382", "step": 2377, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:23.223222", "step": 2377, "epoch": 3 }, { "type": "loss", "content": 0.0740358904004097, "timestamp": "2025-09-15 03:21:23.225488", "step": 2378, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:23.255712", "step": 2378, "epoch": 3 }, { "type": "loss", "content": 0.0010343164904043078, "timestamp": "2025-09-15 03:21:23.257944", "step": 2379, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:23.288526", "step": 2379, "epoch": 3 }, { "type": "loss", "content": 0.0007979201036505401, "timestamp": "2025-09-15 03:21:23.311986", "step": 2380, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:23.342550", "step": 2380, "epoch": 3 }, { "type": "loss", "content": 0.00026409697602503, "timestamp": "2025-09-15 03:21:23.344552", "step": 2381, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:23.374381", "step": 2381, "epoch": 3 }, { "type": "loss", "content": 0.031576044857501984, "timestamp": "2025-09-15 03:21:23.376382", "step": 2382, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:23.406257", "step": 2382, "epoch": 3 }, { "type": "loss", "content": 0.0009410924976691604, "timestamp": "2025-09-15 03:21:23.408369", "step": 2383, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:23.438367", "step": 2383, "epoch": 3 }, { "type": "loss", "content": 0.006526883225888014, "timestamp": "2025-09-15 03:21:23.461644", "step": 2384, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:23.493187", "step": 2384, "epoch": 3 }, { "type": "loss", "content": 0.0004621174302883446, "timestamp": "2025-09-15 03:21:23.495243", "step": 2385, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:23.525161", "step": 2385, "epoch": 3 }, { "type": "loss", "content": 0.00030433182837441564, "timestamp": "2025-09-15 03:21:23.527347", "step": 2386, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:23.558758", "step": 2386, "epoch": 3 }, { "type": "loss", "content": 0.001242363709025085, "timestamp": "2025-09-15 03:21:23.560809", "step": 2387, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:23.591637", "step": 2387, "epoch": 3 }, { "type": "loss", "content": 0.03715752810239792, "timestamp": "2025-09-15 03:21:23.615062", "step": 2388, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:23.646027", "step": 2388, "epoch": 3 }, { "type": "loss", "content": 0.05452625826001167, "timestamp": "2025-09-15 03:21:23.648252", "step": 2389, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:23.678368", "step": 2389, "epoch": 3 }, { "type": "loss", "content": 0.0011189163196831942, "timestamp": "2025-09-15 03:21:23.680893", "step": 2390, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:23.714005", "step": 2390, "epoch": 3 }, { "type": "loss", "content": 0.02448815479874611, "timestamp": "2025-09-15 03:21:23.716919", "step": 2391, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:23.747137", "step": 2391, "epoch": 3 }, { "type": "loss", "content": 0.020943596959114075, "timestamp": "2025-09-15 03:21:23.770523", "step": 2392, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:23.801773", "step": 2392, "epoch": 3 }, { "type": "loss", "content": 0.022623786702752113, "timestamp": "2025-09-15 03:21:23.803869", "step": 2393, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:23.833998", "step": 2393, "epoch": 3 }, { "type": "loss", "content": 0.00040333118522539735, "timestamp": "2025-09-15 03:21:23.836414", "step": 2394, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:21:24.564591", "step": 2394, "epoch": 3 }, { "type": "pplx", "content": 65587889.72230218, "timestamp": "2025-09-15 03:21:24.566516", "step": 2394, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:24.594840", "step": 2394, "epoch": 3 }, { "type": "loss", "content": 0.0008957642712630332, "timestamp": "2025-09-15 03:21:24.596892", "step": 2395, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:24.626731", "step": 2395, "epoch": 3 }, { "type": "loss", "content": 0.001257409923709929, "timestamp": "2025-09-15 03:21:24.651088", "step": 2396, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:24.681378", "step": 2396, "epoch": 3 }, { "type": "loss", "content": 0.014478609897196293, "timestamp": "2025-09-15 03:21:24.683456", "step": 2397, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:24.715105", "step": 2397, "epoch": 3 }, { "type": "loss", "content": 0.01863393373787403, "timestamp": "2025-09-15 03:21:24.717097", "step": 2398, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:24.747328", "step": 2398, "epoch": 3 }, { "type": "loss", "content": 0.03859979286789894, "timestamp": "2025-09-15 03:21:24.749398", "step": 2399, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:24.779655", "step": 2399, "epoch": 3 }, { "type": "loss", "content": 0.035878926515579224, "timestamp": "2025-09-15 03:21:24.803414", "step": 2400, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:24.833484", "step": 2400, "epoch": 3 }, { "type": "loss", "content": 0.002484408440068364, "timestamp": "2025-09-15 03:21:24.835694", "step": 2401, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:21:24.866337", "step": 2401, "epoch": 3 }, { "type": "loss", "content": 0.03723729029297829, "timestamp": "2025-09-15 03:21:24.869018", "step": 2402, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:24.899566", "step": 2402, "epoch": 3 }, { "type": "loss", "content": 0.007320962380617857, "timestamp": "2025-09-15 03:21:24.901597", "step": 2403, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:24.931335", "step": 2403, "epoch": 3 }, { "type": "loss", "content": 0.009379333816468716, "timestamp": "2025-09-15 03:21:24.955085", "step": 2404, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:24.985436", "step": 2404, "epoch": 3 }, { "type": "loss", "content": 0.002660617232322693, "timestamp": "2025-09-15 03:21:24.987405", "step": 2405, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:25.018379", "step": 2405, "epoch": 3 }, { "type": "loss", "content": 0.0032046171836555004, "timestamp": "2025-09-15 03:21:25.020380", "step": 2406, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:25.050837", "step": 2406, "epoch": 3 }, { "type": "loss", "content": 0.01012934185564518, "timestamp": "2025-09-15 03:21:25.052941", "step": 2407, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:25.083056", "step": 2407, "epoch": 3 }, { "type": "loss", "content": 0.010252327658236027, "timestamp": "2025-09-15 03:21:25.106652", "step": 2408, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:25.137235", "step": 2408, "epoch": 3 }, { "type": "loss", "content": 0.006697942037135363, "timestamp": "2025-09-15 03:21:25.139231", "step": 2409, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:25.169674", "step": 2409, "epoch": 3 }, { "type": "loss", "content": 0.020396556705236435, "timestamp": "2025-09-15 03:21:25.172033", "step": 2410, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:25.202155", "step": 2410, "epoch": 3 }, { "type": "loss", "content": 0.010722288861870766, "timestamp": "2025-09-15 03:21:25.204307", "step": 2411, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:21:25.234498", "step": 2411, "epoch": 3 }, { "type": "loss", "content": 0.022731030359864235, "timestamp": "2025-09-15 03:21:25.258449", "step": 2412, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:25.288794", "step": 2412, "epoch": 3 }, { "type": "loss", "content": 0.004771314561367035, "timestamp": "2025-09-15 03:21:25.290864", "step": 2413, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:25.320680", "step": 2413, "epoch": 3 }, { "type": "loss", "content": 0.00403338298201561, "timestamp": "2025-09-15 03:21:25.322721", "step": 2414, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:25.353003", "step": 2414, "epoch": 3 }, { "type": "loss", "content": 0.005938275717198849, "timestamp": "2025-09-15 03:21:25.355364", "step": 2415, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:25.385657", "step": 2415, "epoch": 3 }, { "type": "loss", "content": 0.0006778505048714578, "timestamp": "2025-09-15 03:21:25.409300", "step": 2416, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:25.439782", "step": 2416, "epoch": 3 }, { "type": "loss", "content": 0.007004484534263611, "timestamp": "2025-09-15 03:21:25.442040", "step": 2417, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:25.472455", "step": 2417, "epoch": 3 }, { "type": "loss", "content": 0.008051480166614056, "timestamp": "2025-09-15 03:21:25.474815", "step": 2418, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:25.505229", "step": 2418, "epoch": 3 }, { "type": "loss", "content": 0.015677032992243767, "timestamp": "2025-09-15 03:21:25.507344", "step": 2419, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:25.538006", "step": 2419, "epoch": 3 }, { "type": "loss", "content": 0.0012792375637218356, "timestamp": "2025-09-15 03:21:25.562179", "step": 2420, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:25.592334", "step": 2420, "epoch": 3 }, { "type": "loss", "content": 0.000992942019365728, "timestamp": "2025-09-15 03:21:25.594510", "step": 2421, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:25.625590", "step": 2421, "epoch": 3 }, { "type": "loss", "content": 0.04845261946320534, "timestamp": "2025-09-15 03:21:25.627798", "step": 2422, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:25.657733", "step": 2422, "epoch": 3 }, { "type": "loss", "content": 0.031112248077988625, "timestamp": "2025-09-15 03:21:25.659871", "step": 2423, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:25.690334", "step": 2423, "epoch": 3 }, { "type": "loss", "content": 0.011084296740591526, "timestamp": "2025-09-15 03:21:25.713847", "step": 2424, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-15 03:21:25.744747", "step": 2424, "epoch": 3 }, { "type": "loss", "content": 0.019527485594153404, "timestamp": "2025-09-15 03:21:25.746744", "step": 2425, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:25.777085", "step": 2425, "epoch": 3 }, { "type": "loss", "content": 0.007705447729676962, "timestamp": "2025-09-15 03:21:25.779132", "step": 2426, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:25.808627", "step": 2426, "epoch": 3 }, { "type": "loss", "content": 0.01861448585987091, "timestamp": "2025-09-15 03:21:25.810926", "step": 2427, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:25.841556", "step": 2427, "epoch": 3 }, { "type": "loss", "content": 0.0005472367047332227, "timestamp": "2025-09-15 03:21:25.866135", "step": 2428, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:25.896651", "step": 2428, "epoch": 3 }, { "type": "loss", "content": 0.009006543084979057, "timestamp": "2025-09-15 03:21:25.898930", "step": 2429, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:25.929182", "step": 2429, "epoch": 3 }, { "type": "loss", "content": 0.018193485215306282, "timestamp": "2025-09-15 03:21:25.931260", "step": 2430, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:25.961692", "step": 2430, "epoch": 3 }, { "type": "loss", "content": 0.010625666007399559, "timestamp": "2025-09-15 03:21:25.963980", "step": 2431, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:25.993978", "step": 2431, "epoch": 3 }, { "type": "loss", "content": 0.013900967314839363, "timestamp": "2025-09-15 03:21:26.017580", "step": 2432, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:26.048407", "step": 2432, "epoch": 3 }, { "type": "loss", "content": 0.011567777954041958, "timestamp": "2025-09-15 03:21:26.050465", "step": 2433, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:26.081193", "step": 2433, "epoch": 3 }, { "type": "loss", "content": 0.017127785831689835, "timestamp": "2025-09-15 03:21:26.083207", "step": 2434, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:26.113332", "step": 2434, "epoch": 3 }, { "type": "loss", "content": 0.007701248396188021, "timestamp": "2025-09-15 03:21:26.115405", "step": 2435, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:26.145657", "step": 2435, "epoch": 3 }, { "type": "loss", "content": 0.012019234709441662, "timestamp": "2025-09-15 03:21:26.169875", "step": 2436, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:26.200148", "step": 2436, "epoch": 3 }, { "type": "loss", "content": 0.01517223659902811, "timestamp": "2025-09-15 03:21:26.202118", "step": 2437, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:26.232074", "step": 2437, "epoch": 3 }, { "type": "loss", "content": 0.01354182232171297, "timestamp": "2025-09-15 03:21:26.234436", "step": 2438, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:26.264584", "step": 2438, "epoch": 3 }, { "type": "loss", "content": 0.008238118141889572, "timestamp": "2025-09-15 03:21:26.266934", "step": 2439, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:26.297598", "step": 2439, "epoch": 3 }, { "type": "loss", "content": 0.009475002065300941, "timestamp": "2025-09-15 03:21:26.321399", "step": 2440, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:26.352095", "step": 2440, "epoch": 3 }, { "type": "loss", "content": 0.009594145230948925, "timestamp": "2025-09-15 03:21:26.354087", "step": 2441, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:26.384631", "step": 2441, "epoch": 3 }, { "type": "loss", "content": 0.0014280823525041342, "timestamp": "2025-09-15 03:21:26.386828", "step": 2442, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:26.417928", "step": 2442, "epoch": 3 }, { "type": "loss", "content": 0.0034047921653836966, "timestamp": "2025-09-15 03:21:26.419964", "step": 2443, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:26.450878", "step": 2443, "epoch": 3 }, { "type": "loss", "content": 0.006948020774871111, "timestamp": "2025-09-15 03:21:26.474651", "step": 2444, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:26.507541", "step": 2444, "epoch": 3 }, { "type": "loss", "content": 0.015213142149150372, "timestamp": "2025-09-15 03:21:26.509806", "step": 2445, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:26.541494", "step": 2445, "epoch": 3 }, { "type": "loss", "content": 0.02016303315758705, "timestamp": "2025-09-15 03:21:26.546937", "step": 2446, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:26.579146", "step": 2446, "epoch": 3 }, { "type": "loss", "content": 0.01299526821821928, "timestamp": "2025-09-15 03:21:26.581246", "step": 2447, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:26.613740", "step": 2447, "epoch": 3 }, { "type": "loss", "content": 0.0024096709676086903, "timestamp": "2025-09-15 03:21:26.640995", "step": 2448, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:26.671754", "step": 2448, "epoch": 3 }, { "type": "loss", "content": 0.0309018325060606, "timestamp": "2025-09-15 03:21:26.673885", "step": 2449, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:26.706085", "step": 2449, "epoch": 3 }, { "type": "loss", "content": 0.014632557518780231, "timestamp": "2025-09-15 03:21:26.708248", "step": 2450, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:26.739898", "step": 2450, "epoch": 3 }, { "type": "loss", "content": 0.01982324756681919, "timestamp": "2025-09-15 03:21:26.741997", "step": 2451, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:21:27.590220", "step": 2451, "epoch": 3 }, { "type": "pplx", "content": 68557706.19116534, "timestamp": "2025-09-15 03:21:27.592181", "step": 2451, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:27.621170", "step": 2451, "epoch": 3 }, { "type": "loss", "content": 0.009094307199120522, "timestamp": "2025-09-15 03:21:27.648167", "step": 2452, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:27.680986", "step": 2452, "epoch": 3 }, { "type": "loss", "content": 0.011998111382126808, "timestamp": "2025-09-15 03:21:27.683238", "step": 2453, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:27.714151", "step": 2453, "epoch": 3 }, { "type": "loss", "content": 0.00863865576684475, "timestamp": "2025-09-15 03:21:27.716394", "step": 2454, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:27.749598", "step": 2454, "epoch": 3 }, { "type": "loss", "content": 0.035453762859106064, "timestamp": "2025-09-15 03:21:27.751770", "step": 2455, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:27.782656", "step": 2455, "epoch": 3 }, { "type": "loss", "content": 0.0025879372842609882, "timestamp": "2025-09-15 03:21:27.806266", "step": 2456, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:27.837289", "step": 2456, "epoch": 3 }, { "type": "loss", "content": 0.0030494672246277332, "timestamp": "2025-09-15 03:21:27.841544", "step": 2457, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:27.877047", "step": 2457, "epoch": 3 }, { "type": "loss", "content": 0.009609291329979897, "timestamp": "2025-09-15 03:21:27.879293", "step": 2458, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:27.911685", "step": 2458, "epoch": 3 }, { "type": "loss", "content": 0.008864649571478367, "timestamp": "2025-09-15 03:21:27.913781", "step": 2459, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:27.945302", "step": 2459, "epoch": 3 }, { "type": "loss", "content": 0.0009798811515793204, "timestamp": "2025-09-15 03:21:27.973847", "step": 2460, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:28.004516", "step": 2460, "epoch": 3 }, { "type": "loss", "content": 0.004546779673546553, "timestamp": "2025-09-15 03:21:28.006672", "step": 2461, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:28.038157", "step": 2461, "epoch": 3 }, { "type": "loss", "content": 0.010944445617496967, "timestamp": "2025-09-15 03:21:28.040477", "step": 2462, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:28.071077", "step": 2462, "epoch": 3 }, { "type": "loss", "content": 0.028137998655438423, "timestamp": "2025-09-15 03:21:28.074083", "step": 2463, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:28.109670", "step": 2463, "epoch": 3 }, { "type": "loss", "content": 0.04601803421974182, "timestamp": "2025-09-15 03:21:28.133440", "step": 2464, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:28.165002", "step": 2464, "epoch": 3 }, { "type": "loss", "content": 0.007286368403583765, "timestamp": "2025-09-15 03:21:28.167538", "step": 2465, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:28.206360", "step": 2465, "epoch": 3 }, { "type": "loss", "content": 0.012363625690340996, "timestamp": "2025-09-15 03:21:28.209071", "step": 2466, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:28.241590", "step": 2466, "epoch": 3 }, { "type": "loss", "content": 0.007483890745788813, "timestamp": "2025-09-15 03:21:28.243840", "step": 2467, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:28.275003", "step": 2467, "epoch": 3 }, { "type": "loss", "content": 0.021944841369986534, "timestamp": "2025-09-15 03:21:28.300640", "step": 2468, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:28.332649", "step": 2468, "epoch": 3 }, { "type": "loss", "content": 0.01066376268863678, "timestamp": "2025-09-15 03:21:28.334878", "step": 2469, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:28.365115", "step": 2469, "epoch": 3 }, { "type": "loss", "content": 0.007398658897727728, "timestamp": "2025-09-15 03:21:28.367525", "step": 2470, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:28.402939", "step": 2470, "epoch": 3 }, { "type": "loss", "content": 0.0008656633435748518, "timestamp": "2025-09-15 03:21:28.407339", "step": 2471, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:28.449135", "step": 2471, "epoch": 3 }, { "type": "loss", "content": 0.006925874389708042, "timestamp": "2025-09-15 03:21:28.472859", "step": 2472, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:28.503149", "step": 2472, "epoch": 3 }, { "type": "loss", "content": 0.02251579985022545, "timestamp": "2025-09-15 03:21:28.505242", "step": 2473, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:28.535784", "step": 2473, "epoch": 3 }, { "type": "loss", "content": 0.00035866329562850296, "timestamp": "2025-09-15 03:21:28.537832", "step": 2474, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:28.568261", "step": 2474, "epoch": 3 }, { "type": "loss", "content": 0.0008645497146062553, "timestamp": "2025-09-15 03:21:28.578513", "step": 2475, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:28.611000", "step": 2475, "epoch": 3 }, { "type": "loss", "content": 0.006888339761644602, "timestamp": "2025-09-15 03:21:28.634600", "step": 2476, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:28.665409", "step": 2476, "epoch": 3 }, { "type": "loss", "content": 0.0005782764637842774, "timestamp": "2025-09-15 03:21:28.667861", "step": 2477, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:28.709237", "step": 2477, "epoch": 3 }, { "type": "loss", "content": 0.022311091423034668, "timestamp": "2025-09-15 03:21:28.716463", "step": 2478, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:28.747526", "step": 2478, "epoch": 3 }, { "type": "loss", "content": 0.0008715793373994529, "timestamp": "2025-09-15 03:21:28.749803", "step": 2479, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:28.780909", "step": 2479, "epoch": 3 }, { "type": "loss", "content": 0.02877902425825596, "timestamp": "2025-09-15 03:21:28.804379", "step": 2480, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:28.834846", "step": 2480, "epoch": 3 }, { "type": "loss", "content": 0.0014250698732212186, "timestamp": "2025-09-15 03:21:28.837078", "step": 2481, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:28.868960", "step": 2481, "epoch": 3 }, { "type": "loss", "content": 0.00030158410663716495, "timestamp": "2025-09-15 03:21:28.871467", "step": 2482, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:28.901685", "step": 2482, "epoch": 3 }, { "type": "loss", "content": 0.00030347550637088716, "timestamp": "2025-09-15 03:21:28.903831", "step": 2483, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:28.934126", "step": 2483, "epoch": 3 }, { "type": "loss", "content": 0.00023485065321438015, "timestamp": "2025-09-15 03:21:28.957670", "step": 2484, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:28.990306", "step": 2484, "epoch": 3 }, { "type": "loss", "content": 0.029425393790006638, "timestamp": "2025-09-15 03:21:28.992656", "step": 2485, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:29.024933", "step": 2485, "epoch": 3 }, { "type": "loss", "content": 0.006918161641806364, "timestamp": "2025-09-15 03:21:29.027215", "step": 2486, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:29.057954", "step": 2486, "epoch": 3 }, { "type": "loss", "content": 0.006680171005427837, "timestamp": "2025-09-15 03:21:29.060481", "step": 2487, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:29.091190", "step": 2487, "epoch": 3 }, { "type": "loss", "content": 0.00960888247936964, "timestamp": "2025-09-15 03:21:29.114762", "step": 2488, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:29.147338", "step": 2488, "epoch": 3 }, { "type": "loss", "content": 0.0015458361012861133, "timestamp": "2025-09-15 03:21:29.149530", "step": 2489, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:29.180205", "step": 2489, "epoch": 3 }, { "type": "loss", "content": 0.0001561766694067046, "timestamp": "2025-09-15 03:21:29.182478", "step": 2490, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:29.213049", "step": 2490, "epoch": 3 }, { "type": "loss", "content": 8.076949598034844e-05, "timestamp": "2025-09-15 03:21:29.218111", "step": 2491, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:29.251496", "step": 2491, "epoch": 3 }, { "type": "loss", "content": 0.03969201073050499, "timestamp": "2025-09-15 03:21:29.275086", "step": 2492, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:29.306324", "step": 2492, "epoch": 3 }, { "type": "loss", "content": 0.00016137374041136354, "timestamp": "2025-09-15 03:21:29.308586", "step": 2493, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:29.339007", "step": 2493, "epoch": 3 }, { "type": "loss", "content": 0.0026815198361873627, "timestamp": "2025-09-15 03:21:29.341289", "step": 2494, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:29.373253", "step": 2494, "epoch": 3 }, { "type": "loss", "content": 0.015436379238963127, "timestamp": "2025-09-15 03:21:29.375440", "step": 2495, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:29.405720", "step": 2495, "epoch": 3 }, { "type": "loss", "content": 0.00012169930414529517, "timestamp": "2025-09-15 03:21:29.429266", "step": 2496, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:29.459466", "step": 2496, "epoch": 3 }, { "type": "loss", "content": 0.017209528014063835, "timestamp": "2025-09-15 03:21:29.461950", "step": 2497, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:29.493119", "step": 2497, "epoch": 3 }, { "type": "loss", "content": 0.002141644014045596, "timestamp": "2025-09-15 03:21:29.495513", "step": 2498, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:29.528016", "step": 2498, "epoch": 3 }, { "type": "loss", "content": 0.01186156552284956, "timestamp": "2025-09-15 03:21:29.530376", "step": 2499, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:29.561450", "step": 2499, "epoch": 3 }, { "type": "loss", "content": 0.0002478655078448355, "timestamp": "2025-09-15 03:21:29.585122", "step": 2500, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2500", "timestamp": "2025-09-15 03:21:35.884158", "step": 2500, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:35.930457", "step": 2500, "epoch": 3 }, { "type": "loss", "content": 0.0007553789764642715, "timestamp": "2025-09-15 03:21:35.932658", "step": 2501, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:35.963635", "step": 2501, "epoch": 3 }, { "type": "loss", "content": 0.0018034816021099687, "timestamp": "2025-09-15 03:21:35.965881", "step": 2502, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:35.996624", "step": 2502, "epoch": 3 }, { "type": "loss", "content": 0.013548840768635273, "timestamp": "2025-09-15 03:21:35.998793", "step": 2503, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:36.029466", "step": 2503, "epoch": 3 }, { "type": "loss", "content": 0.03326854854822159, "timestamp": "2025-09-15 03:21:36.053312", "step": 2504, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:36.083613", "step": 2504, "epoch": 3 }, { "type": "loss", "content": 0.0012910410296171904, "timestamp": "2025-09-15 03:21:36.085624", "step": 2505, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:36.115702", "step": 2505, "epoch": 3 }, { "type": "loss", "content": 0.001528589054942131, "timestamp": "2025-09-15 03:21:36.117864", "step": 2506, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:36.148799", "step": 2506, "epoch": 3 }, { "type": "loss", "content": 0.0031019821763038635, "timestamp": "2025-09-15 03:21:36.153507", "step": 2507, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:36.185322", "step": 2507, "epoch": 3 }, { "type": "loss", "content": 0.026747560128569603, "timestamp": "2025-09-15 03:21:36.212291", "step": 2508, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:21:36.966405", "step": 2508, "epoch": 3 }, { "type": "pplx", "content": 43281895.896381795, "timestamp": "2025-09-15 03:21:36.968451", "step": 2508, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:36.997379", "step": 2508, "epoch": 3 }, { "type": "loss", "content": 0.02395753376185894, "timestamp": "2025-09-15 03:21:36.999756", "step": 2509, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:37.030864", "step": 2509, "epoch": 3 }, { "type": "loss", "content": 0.001957479165866971, "timestamp": "2025-09-15 03:21:37.033173", "step": 2510, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:37.064203", "step": 2510, "epoch": 3 }, { "type": "loss", "content": 0.002216178458184004, "timestamp": "2025-09-15 03:21:37.066566", "step": 2511, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:37.098731", "step": 2511, "epoch": 3 }, { "type": "loss", "content": 0.008040383458137512, "timestamp": "2025-09-15 03:21:37.122489", "step": 2512, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:37.153206", "step": 2512, "epoch": 3 }, { "type": "loss", "content": 0.006515815854072571, "timestamp": "2025-09-15 03:21:37.155383", "step": 2513, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:37.185781", "step": 2513, "epoch": 3 }, { "type": "loss", "content": 0.0028326960746198893, "timestamp": "2025-09-15 03:21:37.188124", "step": 2514, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:37.218547", "step": 2514, "epoch": 3 }, { "type": "loss", "content": 0.007095829583704472, "timestamp": "2025-09-15 03:21:37.220693", "step": 2515, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:37.251125", "step": 2515, "epoch": 3 }, { "type": "loss", "content": 0.012786582112312317, "timestamp": "2025-09-15 03:21:37.274672", "step": 2516, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:37.305225", "step": 2516, "epoch": 3 }, { "type": "loss", "content": 0.027167638763785362, "timestamp": "2025-09-15 03:21:37.307299", "step": 2517, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:37.338024", "step": 2517, "epoch": 3 }, { "type": "loss", "content": 0.005034402012825012, "timestamp": "2025-09-15 03:21:37.340126", "step": 2518, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:37.370443", "step": 2518, "epoch": 3 }, { "type": "loss", "content": 0.030000852420926094, "timestamp": "2025-09-15 03:21:37.372524", "step": 2519, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:37.403080", "step": 2519, "epoch": 3 }, { "type": "loss", "content": 0.006657686084508896, "timestamp": "2025-09-15 03:21:37.426596", "step": 2520, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:37.457937", "step": 2520, "epoch": 3 }, { "type": "loss", "content": 0.01242148783057928, "timestamp": "2025-09-15 03:21:37.460252", "step": 2521, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:37.491338", "step": 2521, "epoch": 3 }, { "type": "loss", "content": 0.0017653640825301409, "timestamp": "2025-09-15 03:21:37.493498", "step": 2522, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:37.523332", "step": 2522, "epoch": 3 }, { "type": "loss", "content": 0.05272073671221733, "timestamp": "2025-09-15 03:21:37.525466", "step": 2523, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:37.555762", "step": 2523, "epoch": 3 }, { "type": "loss", "content": 0.022209199145436287, "timestamp": "2025-09-15 03:21:37.579357", "step": 2524, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:37.611390", "step": 2524, "epoch": 3 }, { "type": "loss", "content": 0.001376944943331182, "timestamp": "2025-09-15 03:21:37.613464", "step": 2525, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:37.644050", "step": 2525, "epoch": 3 }, { "type": "loss", "content": 0.004452253691852093, "timestamp": "2025-09-15 03:21:37.646248", "step": 2526, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:37.676967", "step": 2526, "epoch": 3 }, { "type": "loss", "content": 0.00443139998242259, "timestamp": "2025-09-15 03:21:37.679186", "step": 2527, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:37.710003", "step": 2527, "epoch": 3 }, { "type": "loss", "content": 0.01706121675670147, "timestamp": "2025-09-15 03:21:37.733427", "step": 2528, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:37.764700", "step": 2528, "epoch": 3 }, { "type": "loss", "content": 0.0020160474814474583, "timestamp": "2025-09-15 03:21:37.766793", "step": 2529, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:37.797458", "step": 2529, "epoch": 3 }, { "type": "loss", "content": 0.00645886966958642, "timestamp": "2025-09-15 03:21:37.799619", "step": 2530, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:37.829867", "step": 2530, "epoch": 3 }, { "type": "loss", "content": 0.004868703428655863, "timestamp": "2025-09-15 03:21:37.832096", "step": 2531, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:37.862932", "step": 2531, "epoch": 3 }, { "type": "loss", "content": 0.0013749711215496063, "timestamp": "2025-09-15 03:21:37.886649", "step": 2532, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:37.919169", "step": 2532, "epoch": 3 }, { "type": "loss", "content": 0.01655150018632412, "timestamp": "2025-09-15 03:21:37.921663", "step": 2533, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:37.951927", "step": 2533, "epoch": 3 }, { "type": "loss", "content": 0.010359219275414944, "timestamp": "2025-09-15 03:21:37.954587", "step": 2534, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:37.984742", "step": 2534, "epoch": 3 }, { "type": "loss", "content": 0.0017317746533080935, "timestamp": "2025-09-15 03:21:37.986790", "step": 2535, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:38.017483", "step": 2535, "epoch": 3 }, { "type": "loss", "content": 0.002996358321979642, "timestamp": "2025-09-15 03:21:38.040977", "step": 2536, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:38.071618", "step": 2536, "epoch": 3 }, { "type": "loss", "content": 0.006995683070272207, "timestamp": "2025-09-15 03:21:38.073837", "step": 2537, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:38.104907", "step": 2537, "epoch": 3 }, { "type": "loss", "content": 0.008663954213261604, "timestamp": "2025-09-15 03:21:38.106977", "step": 2538, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:38.137751", "step": 2538, "epoch": 3 }, { "type": "loss", "content": 0.011092414148151875, "timestamp": "2025-09-15 03:21:38.139837", "step": 2539, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:38.170667", "step": 2539, "epoch": 3 }, { "type": "loss", "content": 0.00014628804638050497, "timestamp": "2025-09-15 03:21:38.194307", "step": 2540, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:38.225229", "step": 2540, "epoch": 3 }, { "type": "loss", "content": 0.00014245144848246127, "timestamp": "2025-09-15 03:21:38.227552", "step": 2541, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:38.260307", "step": 2541, "epoch": 3 }, { "type": "loss", "content": 0.0020488626323640347, "timestamp": "2025-09-15 03:21:38.262358", "step": 2542, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:38.293162", "step": 2542, "epoch": 3 }, { "type": "loss", "content": 0.002959874924272299, "timestamp": "2025-09-15 03:21:38.295284", "step": 2543, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:38.325618", "step": 2543, "epoch": 3 }, { "type": "loss", "content": 0.02810373343527317, "timestamp": "2025-09-15 03:21:38.349107", "step": 2544, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:21:38.379939", "step": 2544, "epoch": 3 }, { "type": "loss", "content": 0.001163217588327825, "timestamp": "2025-09-15 03:21:38.382234", "step": 2545, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:38.412962", "step": 2545, "epoch": 3 }, { "type": "loss", "content": 0.0035799711477011442, "timestamp": "2025-09-15 03:21:38.415091", "step": 2546, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:38.446458", "step": 2546, "epoch": 3 }, { "type": "loss", "content": 0.0019874710123986006, "timestamp": "2025-09-15 03:21:38.448666", "step": 2547, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:38.480161", "step": 2547, "epoch": 3 }, { "type": "loss", "content": 0.0012459383578971028, "timestamp": "2025-09-15 03:21:38.503874", "step": 2548, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:38.534927", "step": 2548, "epoch": 3 }, { "type": "loss", "content": 0.010002275928854942, "timestamp": "2025-09-15 03:21:38.537134", "step": 2549, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:38.567964", "step": 2549, "epoch": 3 }, { "type": "loss", "content": 0.016585873439908028, "timestamp": "2025-09-15 03:21:38.570108", "step": 2550, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:38.601069", "step": 2550, "epoch": 3 }, { "type": "loss", "content": 0.02941265143454075, "timestamp": "2025-09-15 03:21:38.604959", "step": 2551, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:38.635337", "step": 2551, "epoch": 3 }, { "type": "loss", "content": 0.006005280185490847, "timestamp": "2025-09-15 03:21:38.659938", "step": 2552, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:38.692205", "step": 2552, "epoch": 3 }, { "type": "loss", "content": 0.0008412003517150879, "timestamp": "2025-09-15 03:21:38.694480", "step": 2553, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:38.724620", "step": 2553, "epoch": 3 }, { "type": "loss", "content": 0.03075595758855343, "timestamp": "2025-09-15 03:21:38.727017", "step": 2554, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:38.757887", "step": 2554, "epoch": 3 }, { "type": "loss", "content": 5.689586396329105e-05, "timestamp": "2025-09-15 03:21:38.760115", "step": 2555, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:38.791445", "step": 2555, "epoch": 3 }, { "type": "loss", "content": 0.000128539526485838, "timestamp": "2025-09-15 03:21:38.815082", "step": 2556, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:38.846010", "step": 2556, "epoch": 3 }, { "type": "loss", "content": 0.0015599167672917247, "timestamp": "2025-09-15 03:21:38.848241", "step": 2557, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:38.878543", "step": 2557, "epoch": 3 }, { "type": "loss", "content": 0.030791833996772766, "timestamp": "2025-09-15 03:21:38.880768", "step": 2558, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:38.911233", "step": 2558, "epoch": 3 }, { "type": "loss", "content": 0.007090753875672817, "timestamp": "2025-09-15 03:21:38.913565", "step": 2559, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:38.944558", "step": 2559, "epoch": 3 }, { "type": "loss", "content": 0.0018526233034208417, "timestamp": "2025-09-15 03:21:38.968024", "step": 2560, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:38.998581", "step": 2560, "epoch": 3 }, { "type": "loss", "content": 0.0008933954522944987, "timestamp": "2025-09-15 03:21:39.000671", "step": 2561, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:39.031172", "step": 2561, "epoch": 3 }, { "type": "loss", "content": 0.00965001992881298, "timestamp": "2025-09-15 03:21:39.033302", "step": 2562, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:39.063769", "step": 2562, "epoch": 3 }, { "type": "loss", "content": 0.0011859569931402802, "timestamp": "2025-09-15 03:21:39.065844", "step": 2563, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:39.096584", "step": 2563, "epoch": 3 }, { "type": "loss", "content": 0.003663228126242757, "timestamp": "2025-09-15 03:21:39.120260", "step": 2564, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:39.150962", "step": 2564, "epoch": 3 }, { "type": "loss", "content": 0.0022224951535463333, "timestamp": "2025-09-15 03:21:39.153033", "step": 2565, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:21:39.883862", "step": 2565, "epoch": 3 }, { "type": "pplx", "content": 45366570.01248686, "timestamp": "2025-09-15 03:21:39.885759", "step": 2565, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:39.914331", "step": 2565, "epoch": 3 }, { "type": "loss", "content": 0.006390959955751896, "timestamp": "2025-09-15 03:21:39.916645", "step": 2566, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:39.947894", "step": 2566, "epoch": 3 }, { "type": "loss", "content": 0.0008488258463330567, "timestamp": "2025-09-15 03:21:39.950081", "step": 2567, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:39.982017", "step": 2567, "epoch": 3 }, { "type": "loss", "content": 0.0018424444133415818, "timestamp": "2025-09-15 03:21:40.005688", "step": 2568, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:40.036536", "step": 2568, "epoch": 3 }, { "type": "loss", "content": 0.0006706213462166488, "timestamp": "2025-09-15 03:21:40.038931", "step": 2569, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:40.069590", "step": 2569, "epoch": 3 }, { "type": "loss", "content": 0.012076971121132374, "timestamp": "2025-09-15 03:21:40.071622", "step": 2570, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:40.103073", "step": 2570, "epoch": 3 }, { "type": "loss", "content": 0.004673096816986799, "timestamp": "2025-09-15 03:21:40.107358", "step": 2571, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:40.140778", "step": 2571, "epoch": 3 }, { "type": "loss", "content": 0.002782629569992423, "timestamp": "2025-09-15 03:21:40.164974", "step": 2572, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:40.196143", "step": 2572, "epoch": 3 }, { "type": "loss", "content": 0.000997567898593843, "timestamp": "2025-09-15 03:21:40.198180", "step": 2573, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:40.228489", "step": 2573, "epoch": 3 }, { "type": "loss", "content": 0.005449178162962198, "timestamp": "2025-09-15 03:21:40.230719", "step": 2574, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:40.262528", "step": 2574, "epoch": 3 }, { "type": "loss", "content": 0.004305838141590357, "timestamp": "2025-09-15 03:21:40.264752", "step": 2575, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:40.295855", "step": 2575, "epoch": 3 }, { "type": "loss", "content": 0.03190882131457329, "timestamp": "2025-09-15 03:21:40.319488", "step": 2576, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:40.349814", "step": 2576, "epoch": 3 }, { "type": "loss", "content": 0.0013857169542461634, "timestamp": "2025-09-15 03:21:40.351941", "step": 2577, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:40.387765", "step": 2577, "epoch": 3 }, { "type": "loss", "content": 0.003061884781345725, "timestamp": "2025-09-15 03:21:40.391136", "step": 2578, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:40.424284", "step": 2578, "epoch": 3 }, { "type": "loss", "content": 0.004191776271909475, "timestamp": "2025-09-15 03:21:40.428273", "step": 2579, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:40.458953", "step": 2579, "epoch": 3 }, { "type": "loss", "content": 0.008370229043066502, "timestamp": "2025-09-15 03:21:40.482653", "step": 2580, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:40.534777", "step": 2580, "epoch": 3 }, { "type": "loss", "content": 0.0011418802896514535, "timestamp": "2025-09-15 03:21:40.537012", "step": 2581, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:40.567881", "step": 2581, "epoch": 3 }, { "type": "loss", "content": 0.003366305958479643, "timestamp": "2025-09-15 03:21:40.570250", "step": 2582, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:40.601645", "step": 2582, "epoch": 3 }, { "type": "loss", "content": 0.001745986519381404, "timestamp": "2025-09-15 03:21:40.606397", "step": 2583, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:40.637319", "step": 2583, "epoch": 3 }, { "type": "loss", "content": 0.0017125660087913275, "timestamp": "2025-09-15 03:21:40.660787", "step": 2584, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:40.692077", "step": 2584, "epoch": 3 }, { "type": "loss", "content": 0.0020126174204051495, "timestamp": "2025-09-15 03:21:40.694163", "step": 2585, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:40.732749", "step": 2585, "epoch": 3 }, { "type": "loss", "content": 0.031251564621925354, "timestamp": "2025-09-15 03:21:40.734854", "step": 2586, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:40.765176", "step": 2586, "epoch": 3 }, { "type": "loss", "content": 0.0018158911261707544, "timestamp": "2025-09-15 03:21:40.767609", "step": 2587, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:40.798034", "step": 2587, "epoch": 3 }, { "type": "loss", "content": 0.002623507520183921, "timestamp": "2025-09-15 03:21:40.821425", "step": 2588, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:40.865134", "step": 2588, "epoch": 3 }, { "type": "loss", "content": 0.003804678563028574, "timestamp": "2025-09-15 03:21:40.867921", "step": 2589, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:40.898236", "step": 2589, "epoch": 3 }, { "type": "loss", "content": 0.0029680945444852114, "timestamp": "2025-09-15 03:21:40.900375", "step": 2590, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:40.930968", "step": 2590, "epoch": 3 }, { "type": "loss", "content": 0.006071859505027533, "timestamp": "2025-09-15 03:21:40.934342", "step": 2591, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:40.966265", "step": 2591, "epoch": 3 }, { "type": "loss", "content": 0.0016739203128963709, "timestamp": "2025-09-15 03:21:40.990123", "step": 2592, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:41.020829", "step": 2592, "epoch": 3 }, { "type": "loss", "content": 0.0012125660432502627, "timestamp": "2025-09-15 03:21:41.022933", "step": 2593, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:41.052891", "step": 2593, "epoch": 3 }, { "type": "loss", "content": 0.0016867046942934394, "timestamp": "2025-09-15 03:21:41.055090", "step": 2594, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:41.085592", "step": 2594, "epoch": 3 }, { "type": "loss", "content": 0.0024879786651581526, "timestamp": "2025-09-15 03:21:41.088119", "step": 2595, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-15 03:21:41.119045", "step": 2595, "epoch": 3 }, { "type": "loss", "content": 0.01448439247906208, "timestamp": "2025-09-15 03:21:41.142739", "step": 2596, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:41.172775", "step": 2596, "epoch": 3 }, { "type": "loss", "content": 0.00893432553857565, "timestamp": "2025-09-15 03:21:41.174925", "step": 2597, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:41.205102", "step": 2597, "epoch": 3 }, { "type": "loss", "content": 0.003067398676648736, "timestamp": "2025-09-15 03:21:41.207196", "step": 2598, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:41.237417", "step": 2598, "epoch": 3 }, { "type": "loss", "content": 0.001361220725812018, "timestamp": "2025-09-15 03:21:41.239563", "step": 2599, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:41.270230", "step": 2599, "epoch": 3 }, { "type": "loss", "content": 0.0022079572081565857, "timestamp": "2025-09-15 03:21:41.293871", "step": 2600, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:41.324734", "step": 2600, "epoch": 3 }, { "type": "loss", "content": 0.0025524452794343233, "timestamp": "2025-09-15 03:21:41.327302", "step": 2601, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:41.357904", "step": 2601, "epoch": 3 }, { "type": "loss", "content": 0.0015212270664051175, "timestamp": "2025-09-15 03:21:41.359965", "step": 2602, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:41.390604", "step": 2602, "epoch": 3 }, { "type": "loss", "content": 0.00080554757732898, "timestamp": "2025-09-15 03:21:41.392758", "step": 2603, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:41.424261", "step": 2603, "epoch": 3 }, { "type": "loss", "content": 0.0060416231863200665, "timestamp": "2025-09-15 03:21:41.447677", "step": 2604, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:41.478527", "step": 2604, "epoch": 3 }, { "type": "loss", "content": 0.008705796673893929, "timestamp": "2025-09-15 03:21:41.480695", "step": 2605, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:41.510965", "step": 2605, "epoch": 3 }, { "type": "loss", "content": 0.00011078565148636699, "timestamp": "2025-09-15 03:21:41.513295", "step": 2606, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:41.543566", "step": 2606, "epoch": 3 }, { "type": "loss", "content": 0.031115766614675522, "timestamp": "2025-09-15 03:21:41.545946", "step": 2607, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:41.576610", "step": 2607, "epoch": 3 }, { "type": "loss", "content": 0.0001265882165171206, "timestamp": "2025-09-15 03:21:41.600105", "step": 2608, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:41.631427", "step": 2608, "epoch": 3 }, { "type": "loss", "content": 0.0014504080172628164, "timestamp": "2025-09-15 03:21:41.633831", "step": 2609, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:41.665216", "step": 2609, "epoch": 3 }, { "type": "loss", "content": 0.0015388702740892768, "timestamp": "2025-09-15 03:21:41.667566", "step": 2610, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:41.698328", "step": 2610, "epoch": 3 }, { "type": "loss", "content": 0.0017330830451101065, "timestamp": "2025-09-15 03:21:41.700618", "step": 2611, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:41.730813", "step": 2611, "epoch": 3 }, { "type": "loss", "content": 0.0002649607486091554, "timestamp": "2025-09-15 03:21:41.754623", "step": 2612, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:41.785691", "step": 2612, "epoch": 3 }, { "type": "loss", "content": 0.04479163885116577, "timestamp": "2025-09-15 03:21:41.787879", "step": 2613, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:41.818645", "step": 2613, "epoch": 3 }, { "type": "loss", "content": 0.00031301137642003596, "timestamp": "2025-09-15 03:21:41.820748", "step": 2614, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:41.851131", "step": 2614, "epoch": 3 }, { "type": "loss", "content": 0.001099871238693595, "timestamp": "2025-09-15 03:21:41.853299", "step": 2615, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:41.884701", "step": 2615, "epoch": 3 }, { "type": "loss", "content": 0.0005943683790974319, "timestamp": "2025-09-15 03:21:41.908528", "step": 2616, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:41.940357", "step": 2616, "epoch": 3 }, { "type": "loss", "content": 0.013975650072097778, "timestamp": "2025-09-15 03:21:41.942502", "step": 2617, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:41.973872", "step": 2617, "epoch": 3 }, { "type": "loss", "content": 0.00039200225728563964, "timestamp": "2025-09-15 03:21:41.976310", "step": 2618, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:42.011271", "step": 2618, "epoch": 3 }, { "type": "loss", "content": 9.524152847006917e-05, "timestamp": "2025-09-15 03:21:42.013447", "step": 2619, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:42.043877", "step": 2619, "epoch": 3 }, { "type": "loss", "content": 2.9366343369474635e-05, "timestamp": "2025-09-15 03:21:42.067400", "step": 2620, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:42.098437", "step": 2620, "epoch": 3 }, { "type": "loss", "content": 2.4567301807110198e-05, "timestamp": "2025-09-15 03:21:42.100541", "step": 2621, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:42.131107", "step": 2621, "epoch": 3 }, { "type": "loss", "content": 0.006696456111967564, "timestamp": "2025-09-15 03:21:42.133346", "step": 2622, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:21:42.863649", "step": 2622, "epoch": 3 }, { "type": "pplx", "content": 60060753.57362444, "timestamp": "2025-09-15 03:21:42.865790", "step": 2622, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:42.895445", "step": 2622, "epoch": 3 }, { "type": "loss", "content": 0.00233951723203063, "timestamp": "2025-09-15 03:21:42.897760", "step": 2623, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:42.928491", "step": 2623, "epoch": 3 }, { "type": "loss", "content": 3.5728735383599997e-05, "timestamp": "2025-09-15 03:21:42.952419", "step": 2624, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:42.983760", "step": 2624, "epoch": 3 }, { "type": "loss", "content": 0.01346229575574398, "timestamp": "2025-09-15 03:21:42.985857", "step": 2625, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:43.017087", "step": 2625, "epoch": 3 }, { "type": "loss", "content": 0.004266827367246151, "timestamp": "2025-09-15 03:21:43.019134", "step": 2626, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:43.049560", "step": 2626, "epoch": 3 }, { "type": "loss", "content": 0.0003306028083898127, "timestamp": "2025-09-15 03:21:43.051705", "step": 2627, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:43.082198", "step": 2627, "epoch": 3 }, { "type": "loss", "content": 0.008768661879003048, "timestamp": "2025-09-15 03:21:43.105873", "step": 2628, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:43.136309", "step": 2628, "epoch": 3 }, { "type": "loss", "content": 0.00028717468376271427, "timestamp": "2025-09-15 03:21:43.138277", "step": 2629, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:43.168707", "step": 2629, "epoch": 3 }, { "type": "loss", "content": 0.00015357838128693402, "timestamp": "2025-09-15 03:21:43.170817", "step": 2630, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:43.201700", "step": 2630, "epoch": 3 }, { "type": "loss", "content": 0.0003812172799371183, "timestamp": "2025-09-15 03:21:43.204301", "step": 2631, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:43.235283", "step": 2631, "epoch": 3 }, { "type": "loss", "content": 0.011865836568176746, "timestamp": "2025-09-15 03:21:43.259045", "step": 2632, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:43.290095", "step": 2632, "epoch": 3 }, { "type": "loss", "content": 0.00015511747915297747, "timestamp": "2025-09-15 03:21:43.292317", "step": 2633, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:43.325492", "step": 2633, "epoch": 3 }, { "type": "loss", "content": 0.0004173486668150872, "timestamp": "2025-09-15 03:21:43.327619", "step": 2634, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:43.358780", "step": 2634, "epoch": 3 }, { "type": "loss", "content": 0.001391603727824986, "timestamp": "2025-09-15 03:21:43.361082", "step": 2635, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:43.392549", "step": 2635, "epoch": 3 }, { "type": "loss", "content": 0.010834120213985443, "timestamp": "2025-09-15 03:21:43.416218", "step": 2636, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:43.446424", "step": 2636, "epoch": 3 }, { "type": "loss", "content": 0.0010919544147327542, "timestamp": "2025-09-15 03:21:43.448779", "step": 2637, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:43.478725", "step": 2637, "epoch": 3 }, { "type": "loss", "content": 0.00014520598051603884, "timestamp": "2025-09-15 03:21:43.480866", "step": 2638, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:43.511478", "step": 2638, "epoch": 3 }, { "type": "loss", "content": 0.0008089410257525742, "timestamp": "2025-09-15 03:21:43.513671", "step": 2639, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:43.543782", "step": 2639, "epoch": 3 }, { "type": "loss", "content": 0.0015242323279380798, "timestamp": "2025-09-15 03:21:43.567474", "step": 2640, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:43.598663", "step": 2640, "epoch": 3 }, { "type": "loss", "content": 0.00014808375271968544, "timestamp": "2025-09-15 03:21:43.600673", "step": 2641, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:43.632624", "step": 2641, "epoch": 3 }, { "type": "loss", "content": 0.00033712037838995457, "timestamp": "2025-09-15 03:21:43.634965", "step": 2642, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:43.665508", "step": 2642, "epoch": 3 }, { "type": "loss", "content": 0.0004964851541444659, "timestamp": "2025-09-15 03:21:43.669020", "step": 2643, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:43.699348", "step": 2643, "epoch": 3 }, { "type": "loss", "content": 0.001874964451417327, "timestamp": "2025-09-15 03:21:43.722884", "step": 2644, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:43.753500", "step": 2644, "epoch": 3 }, { "type": "loss", "content": 0.00029709740192629397, "timestamp": "2025-09-15 03:21:43.755674", "step": 2645, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:43.785929", "step": 2645, "epoch": 3 }, { "type": "loss", "content": 0.0006256395718082786, "timestamp": "2025-09-15 03:21:43.788278", "step": 2646, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:43.818871", "step": 2646, "epoch": 3 }, { "type": "loss", "content": 0.01808498241007328, "timestamp": "2025-09-15 03:21:43.821063", "step": 2647, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:43.851012", "step": 2647, "epoch": 3 }, { "type": "loss", "content": 0.001829305081628263, "timestamp": "2025-09-15 03:21:43.874891", "step": 2648, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:43.906591", "step": 2648, "epoch": 3 }, { "type": "loss", "content": 0.0001833066053222865, "timestamp": "2025-09-15 03:21:43.908998", "step": 2649, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:43.939599", "step": 2649, "epoch": 3 }, { "type": "loss", "content": 0.00026104258722625673, "timestamp": "2025-09-15 03:21:43.942064", "step": 2650, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:43.972713", "step": 2650, "epoch": 3 }, { "type": "loss", "content": 0.00035324011696502566, "timestamp": "2025-09-15 03:21:43.975541", "step": 2651, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:44.006336", "step": 2651, "epoch": 3 }, { "type": "loss", "content": 0.0006148394313640893, "timestamp": "2025-09-15 03:21:44.029899", "step": 2652, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:44.061125", "step": 2652, "epoch": 3 }, { "type": "loss", "content": 0.00014098930114414543, "timestamp": "2025-09-15 03:21:44.063268", "step": 2653, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:44.093846", "step": 2653, "epoch": 3 }, { "type": "loss", "content": 0.00035440208739601076, "timestamp": "2025-09-15 03:21:44.096056", "step": 2654, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:44.127747", "step": 2654, "epoch": 3 }, { "type": "loss", "content": 0.0009488330106250942, "timestamp": "2025-09-15 03:21:44.129929", "step": 2655, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:44.161315", "step": 2655, "epoch": 3 }, { "type": "loss", "content": 0.0007009514956735075, "timestamp": "2025-09-15 03:21:44.184869", "step": 2656, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:44.215528", "step": 2656, "epoch": 3 }, { "type": "loss", "content": 0.0003025685145985335, "timestamp": "2025-09-15 03:21:44.217657", "step": 2657, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:44.248413", "step": 2657, "epoch": 3 }, { "type": "loss", "content": 0.0018814911600202322, "timestamp": "2025-09-15 03:21:44.250676", "step": 2658, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:44.281706", "step": 2658, "epoch": 3 }, { "type": "loss", "content": 0.00033558739232830703, "timestamp": "2025-09-15 03:21:44.283788", "step": 2659, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:44.314900", "step": 2659, "epoch": 3 }, { "type": "loss", "content": 0.00040636741323396564, "timestamp": "2025-09-15 03:21:44.338398", "step": 2660, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:44.368751", "step": 2660, "epoch": 3 }, { "type": "loss", "content": 0.00021982923499308527, "timestamp": "2025-09-15 03:21:44.370940", "step": 2661, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:44.402560", "step": 2661, "epoch": 3 }, { "type": "loss", "content": 0.0003634126915130764, "timestamp": "2025-09-15 03:21:44.404651", "step": 2662, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:44.434809", "step": 2662, "epoch": 3 }, { "type": "loss", "content": 0.00037710348260588944, "timestamp": "2025-09-15 03:21:44.437307", "step": 2663, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:44.467423", "step": 2663, "epoch": 3 }, { "type": "loss", "content": 0.009560467675328255, "timestamp": "2025-09-15 03:21:44.491184", "step": 2664, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:44.522005", "step": 2664, "epoch": 3 }, { "type": "loss", "content": 0.00657200813293457, "timestamp": "2025-09-15 03:21:44.524383", "step": 2665, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:44.555113", "step": 2665, "epoch": 3 }, { "type": "loss", "content": 0.00010232098429696634, "timestamp": "2025-09-15 03:21:44.557346", "step": 2666, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:44.588029", "step": 2666, "epoch": 3 }, { "type": "loss", "content": 0.00013133182073943317, "timestamp": "2025-09-15 03:21:44.590241", "step": 2667, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:44.621159", "step": 2667, "epoch": 3 }, { "type": "loss", "content": 0.013489065691828728, "timestamp": "2025-09-15 03:21:44.644643", "step": 2668, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:44.675978", "step": 2668, "epoch": 3 }, { "type": "loss", "content": 0.0016374537954106927, "timestamp": "2025-09-15 03:21:44.678154", "step": 2669, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:44.710450", "step": 2669, "epoch": 3 }, { "type": "loss", "content": 0.004283093381673098, "timestamp": "2025-09-15 03:21:44.712627", "step": 2670, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:44.744147", "step": 2670, "epoch": 3 }, { "type": "loss", "content": 7.831586844986305e-05, "timestamp": "2025-09-15 03:21:44.746383", "step": 2671, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:44.777353", "step": 2671, "epoch": 3 }, { "type": "loss", "content": 0.00014679950254503638, "timestamp": "2025-09-15 03:21:44.801185", "step": 2672, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:44.831635", "step": 2672, "epoch": 3 }, { "type": "loss", "content": 0.002847463358193636, "timestamp": "2025-09-15 03:21:44.833801", "step": 2673, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:44.864556", "step": 2673, "epoch": 3 }, { "type": "loss", "content": 7.834843563614413e-05, "timestamp": "2025-09-15 03:21:44.866636", "step": 2674, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:44.897191", "step": 2674, "epoch": 3 }, { "type": "loss", "content": 0.0002578691055532545, "timestamp": "2025-09-15 03:21:44.899298", "step": 2675, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:44.930149", "step": 2675, "epoch": 3 }, { "type": "loss", "content": 0.0002215011336375028, "timestamp": "2025-09-15 03:21:44.953730", "step": 2676, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:44.984333", "step": 2676, "epoch": 3 }, { "type": "loss", "content": 0.0002951612987089902, "timestamp": "2025-09-15 03:21:44.987270", "step": 2677, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:45.017372", "step": 2677, "epoch": 3 }, { "type": "loss", "content": 0.0018803089624270797, "timestamp": "2025-09-15 03:21:45.019468", "step": 2678, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:45.050064", "step": 2678, "epoch": 3 }, { "type": "loss", "content": 0.00012660605716519058, "timestamp": "2025-09-15 03:21:45.052417", "step": 2679, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:21:45.785186", "step": 2679, "epoch": 3 }, { "type": "pplx", "content": 68281292.55607095, "timestamp": "2025-09-15 03:21:45.787200", "step": 2679, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:45.816800", "step": 2679, "epoch": 3 }, { "type": "loss", "content": 0.0001412067358614877, "timestamp": "2025-09-15 03:21:45.840625", "step": 2680, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:45.872075", "step": 2680, "epoch": 3 }, { "type": "loss", "content": 0.0005017804214730859, "timestamp": "2025-09-15 03:21:45.874161", "step": 2681, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:45.905325", "step": 2681, "epoch": 3 }, { "type": "loss", "content": 0.0001562709512654692, "timestamp": "2025-09-15 03:21:45.907477", "step": 2682, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:45.938281", "step": 2682, "epoch": 3 }, { "type": "loss", "content": 5.530984708457254e-05, "timestamp": "2025-09-15 03:21:45.948906", "step": 2683, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:45.987811", "step": 2683, "epoch": 3 }, { "type": "loss", "content": 0.0002064589352812618, "timestamp": "2025-09-15 03:21:46.011675", "step": 2684, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:46.042031", "step": 2684, "epoch": 3 }, { "type": "loss", "content": 4.4304611947154626e-05, "timestamp": "2025-09-15 03:21:46.044071", "step": 2685, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:46.074494", "step": 2685, "epoch": 3 }, { "type": "loss", "content": 0.00019901491759810597, "timestamp": "2025-09-15 03:21:46.076772", "step": 2686, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:46.107441", "step": 2686, "epoch": 3 }, { "type": "loss", "content": 0.0010612963233143091, "timestamp": "2025-09-15 03:21:46.109567", "step": 2687, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:46.140285", "step": 2687, "epoch": 3 }, { "type": "loss", "content": 0.0007707072654739022, "timestamp": "2025-09-15 03:21:46.163862", "step": 2688, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:46.194353", "step": 2688, "epoch": 3 }, { "type": "loss", "content": 9.548853995511308e-05, "timestamp": "2025-09-15 03:21:46.196595", "step": 2689, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:46.227286", "step": 2689, "epoch": 3 }, { "type": "loss", "content": 8.759932825341821e-05, "timestamp": "2025-09-15 03:21:46.229406", "step": 2690, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:46.261499", "step": 2690, "epoch": 3 }, { "type": "loss", "content": 0.00026428294950164855, "timestamp": "2025-09-15 03:21:46.264240", "step": 2691, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:46.295436", "step": 2691, "epoch": 3 }, { "type": "loss", "content": 0.0005423675174824893, "timestamp": "2025-09-15 03:21:46.319207", "step": 2692, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:46.349665", "step": 2692, "epoch": 3 }, { "type": "loss", "content": 6.664798274869099e-05, "timestamp": "2025-09-15 03:21:46.352016", "step": 2693, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:46.382850", "step": 2693, "epoch": 3 }, { "type": "loss", "content": 0.00035035648033954203, "timestamp": "2025-09-15 03:21:46.385436", "step": 2694, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:46.417601", "step": 2694, "epoch": 3 }, { "type": "loss", "content": 0.0007025161758065224, "timestamp": "2025-09-15 03:21:46.419788", "step": 2695, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:46.450857", "step": 2695, "epoch": 3 }, { "type": "loss", "content": 0.0002691586851142347, "timestamp": "2025-09-15 03:21:46.474565", "step": 2696, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:46.505154", "step": 2696, "epoch": 3 }, { "type": "loss", "content": 0.00010667191963875666, "timestamp": "2025-09-15 03:21:46.507460", "step": 2697, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:46.537799", "step": 2697, "epoch": 3 }, { "type": "loss", "content": 0.00014287196972873062, "timestamp": "2025-09-15 03:21:46.540096", "step": 2698, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:46.570506", "step": 2698, "epoch": 3 }, { "type": "loss", "content": 0.004734222777187824, "timestamp": "2025-09-15 03:21:46.572714", "step": 2699, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:46.603948", "step": 2699, "epoch": 3 }, { "type": "loss", "content": 9.962467447621748e-05, "timestamp": "2025-09-15 03:21:46.628968", "step": 2700, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:46.659915", "step": 2700, "epoch": 3 }, { "type": "loss", "content": 0.00013182398106437176, "timestamp": "2025-09-15 03:21:46.663241", "step": 2701, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:46.694487", "step": 2701, "epoch": 3 }, { "type": "loss", "content": 8.584625174989924e-05, "timestamp": "2025-09-15 03:21:46.696182", "step": 2702, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:46.726822", "step": 2702, "epoch": 3 }, { "type": "loss", "content": 0.00016575524932704866, "timestamp": "2025-09-15 03:21:46.728860", "step": 2703, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:46.759112", "step": 2703, "epoch": 3 }, { "type": "loss", "content": 9.599862096365541e-05, "timestamp": "2025-09-15 03:21:46.782761", "step": 2704, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:46.814339", "step": 2704, "epoch": 3 }, { "type": "loss", "content": 0.0006985267391428351, "timestamp": "2025-09-15 03:21:46.817011", "step": 2705, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:46.848214", "step": 2705, "epoch": 3 }, { "type": "loss", "content": 0.0006570377154275775, "timestamp": "2025-09-15 03:21:46.850706", "step": 2706, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:46.881338", "step": 2706, "epoch": 3 }, { "type": "loss", "content": 5.436737774289213e-05, "timestamp": "2025-09-15 03:21:46.883412", "step": 2707, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:46.913573", "step": 2707, "epoch": 3 }, { "type": "loss", "content": 0.00018206385720986873, "timestamp": "2025-09-15 03:21:46.937354", "step": 2708, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:46.969934", "step": 2708, "epoch": 3 }, { "type": "loss", "content": 0.00015278191131073982, "timestamp": "2025-09-15 03:21:46.972011", "step": 2709, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:47.002584", "step": 2709, "epoch": 3 }, { "type": "loss", "content": 0.00026051508029922843, "timestamp": "2025-09-15 03:21:47.004840", "step": 2710, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:47.035438", "step": 2710, "epoch": 3 }, { "type": "loss", "content": 7.116630877135321e-05, "timestamp": "2025-09-15 03:21:47.037632", "step": 2711, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:47.069438", "step": 2711, "epoch": 3 }, { "type": "loss", "content": 4.763676770380698e-05, "timestamp": "2025-09-15 03:21:47.092859", "step": 2712, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:47.124309", "step": 2712, "epoch": 3 }, { "type": "loss", "content": 0.0005188211798667908, "timestamp": "2025-09-15 03:21:47.126479", "step": 2713, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:47.157195", "step": 2713, "epoch": 3 }, { "type": "loss", "content": 0.014473472721874714, "timestamp": "2025-09-15 03:21:47.159377", "step": 2714, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:47.190566", "step": 2714, "epoch": 3 }, { "type": "loss", "content": 0.022033294662833214, "timestamp": "2025-09-15 03:21:47.192938", "step": 2715, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:47.223935", "step": 2715, "epoch": 3 }, { "type": "loss", "content": 8.771099237492308e-05, "timestamp": "2025-09-15 03:21:47.247565", "step": 2716, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:47.279232", "step": 2716, "epoch": 3 }, { "type": "loss", "content": 0.0009455090621486306, "timestamp": "2025-09-15 03:21:47.282112", "step": 2717, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:47.313133", "step": 2717, "epoch": 3 }, { "type": "loss", "content": 2.2676438675262034e-05, "timestamp": "2025-09-15 03:21:47.316036", "step": 2718, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:47.346397", "step": 2718, "epoch": 3 }, { "type": "loss", "content": 0.000588989001698792, "timestamp": "2025-09-15 03:21:47.348461", "step": 2719, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:47.379888", "step": 2719, "epoch": 3 }, { "type": "loss", "content": 0.003522381419315934, "timestamp": "2025-09-15 03:21:47.403617", "step": 2720, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:47.434835", "step": 2720, "epoch": 3 }, { "type": "loss", "content": 9.874381794361398e-05, "timestamp": "2025-09-15 03:21:47.437366", "step": 2721, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:47.468699", "step": 2721, "epoch": 3 }, { "type": "loss", "content": 6.181753269629553e-05, "timestamp": "2025-09-15 03:21:47.470831", "step": 2722, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:47.503234", "step": 2722, "epoch": 3 }, { "type": "loss", "content": 0.021309752017259598, "timestamp": "2025-09-15 03:21:47.505418", "step": 2723, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:47.536114", "step": 2723, "epoch": 3 }, { "type": "loss", "content": 5.099936606711708e-05, "timestamp": "2025-09-15 03:21:47.559681", "step": 2724, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:47.590837", "step": 2724, "epoch": 3 }, { "type": "loss", "content": 0.019538061693310738, "timestamp": "2025-09-15 03:21:47.593050", "step": 2725, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:47.623804", "step": 2725, "epoch": 3 }, { "type": "loss", "content": 0.010590228252112865, "timestamp": "2025-09-15 03:21:47.625964", "step": 2726, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:47.656938", "step": 2726, "epoch": 3 }, { "type": "loss", "content": 0.0008069784962572157, "timestamp": "2025-09-15 03:21:47.659467", "step": 2727, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:47.690899", "step": 2727, "epoch": 3 }, { "type": "loss", "content": 0.0004390797985251993, "timestamp": "2025-09-15 03:21:47.716573", "step": 2728, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:47.748630", "step": 2728, "epoch": 3 }, { "type": "loss", "content": 0.0014442296233028173, "timestamp": "2025-09-15 03:21:47.750823", "step": 2729, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:47.781695", "step": 2729, "epoch": 3 }, { "type": "loss", "content": 0.0001858282630564645, "timestamp": "2025-09-15 03:21:47.784050", "step": 2730, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:47.815812", "step": 2730, "epoch": 3 }, { "type": "loss", "content": 8.658268779981881e-05, "timestamp": "2025-09-15 03:21:47.817992", "step": 2731, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:47.850328", "step": 2731, "epoch": 3 }, { "type": "loss", "content": 0.0170897264033556, "timestamp": "2025-09-15 03:21:47.873838", "step": 2732, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:47.905086", "step": 2732, "epoch": 3 }, { "type": "loss", "content": 0.00023263516777660698, "timestamp": "2025-09-15 03:21:47.907703", "step": 2733, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:47.938197", "step": 2733, "epoch": 3 }, { "type": "loss", "content": 0.00016691691416781396, "timestamp": "2025-09-15 03:21:47.940489", "step": 2734, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:47.971067", "step": 2734, "epoch": 3 }, { "type": "loss", "content": 5.8849462220678106e-05, "timestamp": "2025-09-15 03:21:47.973257", "step": 2735, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:48.004099", "step": 2735, "epoch": 3 }, { "type": "loss", "content": 0.018125424161553383, "timestamp": "2025-09-15 03:21:48.027968", "step": 2736, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:21:48.761944", "step": 2736, "epoch": 3 }, { "type": "pplx", "content": 67706275.31001619, "timestamp": "2025-09-15 03:21:48.764361", "step": 2736, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:48.794358", "step": 2736, "epoch": 3 }, { "type": "loss", "content": 0.00023372893338091671, "timestamp": "2025-09-15 03:21:48.796425", "step": 2737, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:48.826981", "step": 2737, "epoch": 3 }, { "type": "loss", "content": 0.00027472517103888094, "timestamp": "2025-09-15 03:21:48.829412", "step": 2738, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:48.861155", "step": 2738, "epoch": 3 }, { "type": "loss", "content": 0.0010784949408844113, "timestamp": "2025-09-15 03:21:48.863355", "step": 2739, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:48.895491", "step": 2739, "epoch": 3 }, { "type": "loss", "content": 0.00026522178086452186, "timestamp": "2025-09-15 03:21:48.919244", "step": 2740, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:48.949712", "step": 2740, "epoch": 3 }, { "type": "loss", "content": 0.0005690989783033729, "timestamp": "2025-09-15 03:21:48.951854", "step": 2741, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:48.983057", "step": 2741, "epoch": 3 }, { "type": "loss", "content": 0.003584182122722268, "timestamp": "2025-09-15 03:21:48.985187", "step": 2742, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:49.016147", "step": 2742, "epoch": 3 }, { "type": "loss", "content": 0.01012119185179472, "timestamp": "2025-09-15 03:21:49.018323", "step": 2743, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:49.049155", "step": 2743, "epoch": 3 }, { "type": "loss", "content": 0.008594135753810406, "timestamp": "2025-09-15 03:21:49.072888", "step": 2744, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:49.103719", "step": 2744, "epoch": 3 }, { "type": "loss", "content": 0.0050062802620232105, "timestamp": "2025-09-15 03:21:49.106022", "step": 2745, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:49.138939", "step": 2745, "epoch": 3 }, { "type": "loss", "content": 0.023680929094552994, "timestamp": "2025-09-15 03:21:49.141087", "step": 2746, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:49.171554", "step": 2746, "epoch": 3 }, { "type": "loss", "content": 0.0008458025404252112, "timestamp": "2025-09-15 03:21:49.173832", "step": 2747, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-15 03:21:49.204668", "step": 2747, "epoch": 3 }, { "type": "loss", "content": 0.014508053660392761, "timestamp": "2025-09-15 03:21:49.228490", "step": 2748, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:49.259521", "step": 2748, "epoch": 3 }, { "type": "loss", "content": 0.002385776722803712, "timestamp": "2025-09-15 03:21:49.261831", "step": 2749, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-15 03:21:49.292328", "step": 2749, "epoch": 3 }, { "type": "loss", "content": 0.04963759332895279, "timestamp": "2025-09-15 03:21:49.294679", "step": 2750, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-15 03:21:49.326062", "step": 2750, "epoch": 3 }, { "type": "loss", "content": 0.0025882285553961992, "timestamp": "2025-09-15 03:21:49.328510", "step": 2751, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-15 03:21:50.073511", "step": 2751, "epoch": 3 }, { "type": "pplx", "content": 65059509.345948905, "timestamp": "2025-09-15 03:21:50.075790", "step": 2751, "epoch": 3 }, { "type": "best_pplx", "content": 43281895.896381795, "timestamp": "2025-09-15 03:21:50.077423", "step": 2751, "epoch": 3 }, { "type": "best_step", "content": 2508, "timestamp": "2025-09-15 03:21:50.078993", "step": 2751, "epoch": 3 }, { "type": "total_pplx_flops", "content": 5014951860256000, "timestamp": "2025-09-15 03:21:50.080603", "step": 2751, "epoch": 3 }, { "type": "total_train_flops", "content": 10640863719936576, "timestamp": "2025-09-15 03:21:50.082991", "step": 2751, "epoch": 3 } ], "best_evals": { "pplx": { "score": 43281895.896381795, "step": 2508 }, "rougel": { "precision": 0.8284313725490197, "recall": 0.8284313725490197, "fmeasure": 0.8284313725490197 } } }