{ "training_args": { "output_dir": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters_2/nlu_mrpc_ff_v1", "overwrite_output_dir": false, "do_train": false, "do_eval": true, "do_predict": false, "eval_strategy": "steps", "prediction_loss_only": false, "per_device_train_batch_size": 4, "per_device_eval_batch_size": 8, "per_gpu_train_batch_size": null, "per_gpu_eval_batch_size": null, "gradient_accumulation_steps": 4, "eval_accumulation_steps": null, "eval_delay": 0, "torch_empty_cache_steps": null, "learning_rate": 2e-05, "weight_decay": 0.0, "adam_beta1": 0.9, "adam_beta2": 0.999, "adam_epsilon": 1e-08, "max_grad_norm": 1.0, "num_train_epochs": 3, "max_steps": -1, "lr_scheduler_type": "linear", "lr_scheduler_kwargs": {}, "warmup_ratio": 0.0, "warmup_steps": 0, "log_level": "passive", "log_level_replica": "warning", "log_on_each_node": true, "logging_dir": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters_2/nlu_mrpc_ff_v1/runs/Sep30_22-09-04_gx13", "logging_strategy": "steps", "logging_first_step": false, "logging_steps": 20, "logging_nan_inf_filter": true, "save_strategy": "epoch", "save_steps": 500, "save_total_limit": null, "save_safetensors": true, "save_on_each_node": false, "save_only_model": false, "restore_callback_states_from_checkpoint": false, "no_cuda": false, "use_cpu": false, "use_mps_device": false, "seed": 42, "data_seed": null, "jit_mode_eval": false, "use_ipex": false, "bf16": false, "fp16": false, "fp16_opt_level": "O1", "half_precision_backend": "auto", "bf16_full_eval": false, "fp16_full_eval": false, "tf32": null, "local_rank": 0, "ddp_backend": null, "tpu_num_cores": null, "tpu_metrics_debug": false, "debug": [], "dataloader_drop_last": false, "eval_steps": 57, "dataloader_num_workers": 0, "dataloader_prefetch_factor": null, "past_index": -1, "run_name": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters_2/nlu_mrpc_ff_v1", "disable_tqdm": false, "remove_unused_columns": true, "label_names": null, "load_best_model_at_end": false, "metric_for_best_model": null, "greater_is_better": null, "ignore_data_skip": false, "fsdp": [], "fsdp_min_num_params": 0, "fsdp_config": { "min_num_params": 0, "xla": false, "xla_fsdp_v2": false, "xla_fsdp_grad_ckpt": false }, "fsdp_transformer_layer_cls_to_wrap": null, "accelerator_config": { "split_batches": false, "dispatch_batches": null, "even_batches": true, "use_seedable_sampler": true, "non_blocking": false, "gradient_accumulation_kwargs": null }, "deepspeed": null, "label_smoothing_factor": 0.0, "optim": "adamw_torch", "optim_args": null, "adafactor": false, "group_by_length": false, "length_column_name": "length", "report_to": [], "ddp_find_unused_parameters": null, "ddp_bucket_cap_mb": null, "ddp_broadcast_buffers": null, "dataloader_pin_memory": true, "dataloader_persistent_workers": false, "skip_memory_metrics": true, "use_legacy_prediction_loop": false, "push_to_hub": false, "resume_from_checkpoint": null, "hub_model_id": null, "hub_strategy": "every_save", "hub_token": "", "hub_private_repo": null, "hub_always_push": false, "gradient_checkpointing": false, "gradient_checkpointing_kwargs": null, "include_inputs_for_metrics": false, "include_for_metrics": [], "eval_do_concat_batches": true, "fp16_backend": "auto", "push_to_hub_model_id": null, "push_to_hub_organization": null, "push_to_hub_token": "", "mp_parameters": "", "auto_find_batch_size": false, "full_determinism": false, "torchdynamo": null, "ray_scope": "last", "ddp_timeout": 1800, "torch_compile": false, "torch_compile_backend": null, "torch_compile_mode": null, "include_tokens_per_second": false, "include_num_input_tokens_seen": false, "neftune_noise_alpha": null, "optim_target_modules": null, "batch_eval_metrics": false, "eval_on_start": false, "use_liger_kernel": false, "eval_use_gather_object": false, "average_tokens_across_devices": false }, "lora_config": null, "flops": { "eval": 5014951860256000, "train": 10640863719936576, "total": 15655815580192576 }, "total": { "total": 58401.35561, "train": 45370.694260000004, "eval": 13030.661350000002 }, "logs": [ { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:09:11.410034", "step": 0, "epoch": 0 }, { "type": "pplx", "content": 226674977.87649825, "timestamp": "2025-09-30 22:09:11.415731", "step": 0, "epoch": 0 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:11.493702", "step": 0, "epoch": 1 }, { "type": "loss", "content": 0.7057779431343079, "timestamp": "2025-09-30 22:09:11.497137", "step": 1, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:11.558276", "step": 1, "epoch": 1 }, { "type": "loss", "content": 0.6982383131980896, "timestamp": "2025-09-30 22:09:11.561370", "step": 2, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:11.608783", "step": 2, "epoch": 1 }, { "type": "loss", "content": 0.7418850064277649, "timestamp": "2025-09-30 22:09:11.611339", "step": 3, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:11.652352", "step": 3, "epoch": 1 }, { "type": "loss", "content": 0.7169809341430664, "timestamp": "2025-09-30 22:09:11.740181", "step": 4, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:11.776307", "step": 4, "epoch": 1 }, { "type": "loss", "content": 0.1316017359495163, "timestamp": "2025-09-30 22:09:11.786063", "step": 5, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:11.819994", "step": 5, "epoch": 1 }, { "type": "loss", "content": 0.1368536502122879, "timestamp": "2025-09-30 22:09:11.828623", "step": 6, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:11.862287", "step": 6, "epoch": 1 }, { "type": "loss", "content": 0.13541023433208466, "timestamp": "2025-09-30 22:09:11.866684", "step": 7, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:11.905650", "step": 7, "epoch": 1 }, { "type": "loss", "content": 0.1483638435602188, "timestamp": "2025-09-30 22:09:11.930916", "step": 8, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:11.966263", "step": 8, "epoch": 1 }, { "type": "loss", "content": 0.005413126666098833, "timestamp": "2025-09-30 22:09:11.969115", "step": 9, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:12.002222", "step": 9, "epoch": 1 }, { "type": "loss", "content": 0.05680262669920921, "timestamp": "2025-09-30 22:09:12.005936", "step": 10, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:12.053094", "step": 10, "epoch": 1 }, { "type": "loss", "content": 0.025625307112932205, "timestamp": "2025-09-30 22:09:12.056458", "step": 11, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:12.090015", "step": 11, "epoch": 1 }, { "type": "loss", "content": 0.006495438050478697, "timestamp": "2025-09-30 22:09:12.115606", "step": 12, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:12.154093", "step": 12, "epoch": 1 }, { "type": "loss", "content": 0.02278040535748005, "timestamp": "2025-09-30 22:09:12.156747", "step": 13, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:12.190335", "step": 13, "epoch": 1 }, { "type": "loss", "content": 0.03420593962073326, "timestamp": "2025-09-30 22:09:12.198560", "step": 14, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:12.241662", "step": 14, "epoch": 1 }, { "type": "loss", "content": 0.026615411043167114, "timestamp": "2025-09-30 22:09:12.251123", "step": 15, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:12.289251", "step": 15, "epoch": 1 }, { "type": "loss", "content": 0.02100050076842308, "timestamp": "2025-09-30 22:09:12.319233", "step": 16, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:12.350524", "step": 16, "epoch": 1 }, { "type": "loss", "content": 0.05153878405690193, "timestamp": "2025-09-30 22:09:12.353778", "step": 17, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:12.389672", "step": 17, "epoch": 1 }, { "type": "loss", "content": 0.04646531492471695, "timestamp": "2025-09-30 22:09:12.394417", "step": 18, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:12.438804", "step": 18, "epoch": 1 }, { "type": "loss", "content": 0.03298034518957138, "timestamp": "2025-09-30 22:09:12.441980", "step": 19, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:12.476772", "step": 19, "epoch": 1 }, { "type": "loss", "content": 0.030850352719426155, "timestamp": "2025-09-30 22:09:12.502012", "step": 20, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:12.558330", "step": 20, "epoch": 1 }, { "type": "loss", "content": 0.0381910540163517, "timestamp": "2025-09-30 22:09:12.561275", "step": 21, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:12.594243", "step": 21, "epoch": 1 }, { "type": "loss", "content": 0.04014469310641289, "timestamp": "2025-09-30 22:09:12.598636", "step": 22, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:12.637101", "step": 22, "epoch": 1 }, { "type": "loss", "content": 0.02601078525185585, "timestamp": "2025-09-30 22:09:12.639563", "step": 23, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:12.674795", "step": 23, "epoch": 1 }, { "type": "loss", "content": 0.021312592551112175, "timestamp": "2025-09-30 22:09:12.707547", "step": 24, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:12.753705", "step": 24, "epoch": 1 }, { "type": "loss", "content": 0.024147259071469307, "timestamp": "2025-09-30 22:09:12.757263", "step": 25, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:12.813553", "step": 25, "epoch": 1 }, { "type": "loss", "content": 0.03936280682682991, "timestamp": "2025-09-30 22:09:12.818761", "step": 26, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:12.861309", "step": 26, "epoch": 1 }, { "type": "loss", "content": 0.02394765429198742, "timestamp": "2025-09-30 22:09:12.865361", "step": 27, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:12.909284", "step": 27, "epoch": 1 }, { "type": "loss", "content": 0.023970751091837883, "timestamp": "2025-09-30 22:09:12.934619", "step": 28, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:12.972121", "step": 28, "epoch": 1 }, { "type": "loss", "content": 0.03464008867740631, "timestamp": "2025-09-30 22:09:12.975501", "step": 29, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:13.019468", "step": 29, "epoch": 1 }, { "type": "loss", "content": 0.022118892520666122, "timestamp": "2025-09-30 22:09:13.022793", "step": 30, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:13.057302", "step": 30, "epoch": 1 }, { "type": "loss", "content": 0.02154598757624626, "timestamp": "2025-09-30 22:09:13.060719", "step": 31, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:13.096414", "step": 31, "epoch": 1 }, { "type": "loss", "content": 0.021323371678590775, "timestamp": "2025-09-30 22:09:13.121348", "step": 32, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:13.165154", "step": 32, "epoch": 1 }, { "type": "loss", "content": 0.018217450007796288, "timestamp": "2025-09-30 22:09:13.168824", "step": 33, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:13.213240", "step": 33, "epoch": 1 }, { "type": "loss", "content": 0.031157325953245163, "timestamp": "2025-09-30 22:09:13.224499", "step": 34, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:13.266485", "step": 34, "epoch": 1 }, { "type": "loss", "content": 0.016447557136416435, "timestamp": "2025-09-30 22:09:13.271677", "step": 35, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:13.304897", "step": 35, "epoch": 1 }, { "type": "loss", "content": 0.02151617407798767, "timestamp": "2025-09-30 22:09:13.337755", "step": 36, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:13.372624", "step": 36, "epoch": 1 }, { "type": "loss", "content": 0.017450835555791855, "timestamp": "2025-09-30 22:09:13.375245", "step": 37, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:13.407363", "step": 37, "epoch": 1 }, { "type": "loss", "content": 0.02037977986037731, "timestamp": "2025-09-30 22:09:13.410733", "step": 38, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:13.445573", "step": 38, "epoch": 1 }, { "type": "loss", "content": 0.016287634149193764, "timestamp": "2025-09-30 22:09:13.450012", "step": 39, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:13.492970", "step": 39, "epoch": 1 }, { "type": "loss", "content": 0.016756046563386917, "timestamp": "2025-09-30 22:09:13.517671", "step": 40, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:13.556627", "step": 40, "epoch": 1 }, { "type": "loss", "content": 0.009105638600885868, "timestamp": "2025-09-30 22:09:13.560667", "step": 41, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:13.595034", "step": 41, "epoch": 1 }, { "type": "loss", "content": 0.018572982400655746, "timestamp": "2025-09-30 22:09:13.605675", "step": 42, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:13.654795", "step": 42, "epoch": 1 }, { "type": "loss", "content": 0.03447669371962547, "timestamp": "2025-09-30 22:09:13.665573", "step": 43, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:13.715391", "step": 43, "epoch": 1 }, { "type": "loss", "content": 0.00934014655649662, "timestamp": "2025-09-30 22:09:13.740834", "step": 44, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:13.786780", "step": 44, "epoch": 1 }, { "type": "loss", "content": 0.0063931881450116634, "timestamp": "2025-09-30 22:09:13.789911", "step": 45, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:13.826157", "step": 45, "epoch": 1 }, { "type": "loss", "content": 0.006094901356846094, "timestamp": "2025-09-30 22:09:13.829873", "step": 46, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:13.864380", "step": 46, "epoch": 1 }, { "type": "loss", "content": 0.03661187365651131, "timestamp": "2025-09-30 22:09:13.873107", "step": 47, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:13.911041", "step": 47, "epoch": 1 }, { "type": "loss", "content": 0.0199322160333395, "timestamp": "2025-09-30 22:09:13.935829", "step": 48, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:13.969844", "step": 48, "epoch": 1 }, { "type": "loss", "content": 0.003671335754916072, "timestamp": "2025-09-30 22:09:13.972300", "step": 49, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:14.004074", "step": 49, "epoch": 1 }, { "type": "loss", "content": 0.043977946043014526, "timestamp": "2025-09-30 22:09:14.006632", "step": 50, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:14.042244", "step": 50, "epoch": 1 }, { "type": "loss", "content": 0.0037198185455054045, "timestamp": "2025-09-30 22:09:14.044979", "step": 51, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:14.080140", "step": 51, "epoch": 1 }, { "type": "loss", "content": 0.0373804084956646, "timestamp": "2025-09-30 22:09:14.104935", "step": 52, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:14.137673", "step": 52, "epoch": 1 }, { "type": "loss", "content": 0.03821820765733719, "timestamp": "2025-09-30 22:09:14.140018", "step": 53, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:14.171155", "step": 53, "epoch": 1 }, { "type": "loss", "content": 0.04410596936941147, "timestamp": "2025-09-30 22:09:14.180328", "step": 54, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:14.214887", "step": 54, "epoch": 1 }, { "type": "loss", "content": 0.024004271253943443, "timestamp": "2025-09-30 22:09:14.223302", "step": 55, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:14.261834", "step": 55, "epoch": 1 }, { "type": "loss", "content": 0.042835354804992676, "timestamp": "2025-09-30 22:09:14.287079", "step": 56, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:14.322256", "step": 56, "epoch": 1 }, { "type": "loss", "content": 0.02183734066784382, "timestamp": "2025-09-30 22:09:14.324668", "step": 57, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:09:15.277577", "step": 57, "epoch": 1 }, { "type": "pplx", "content": 50960626.10407889, "timestamp": "2025-09-30 22:09:15.280389", "step": 57, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:15.313721", "step": 57, "epoch": 1 }, { "type": "loss", "content": 0.020511453971266747, "timestamp": "2025-09-30 22:09:15.320925", "step": 58, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:15.358210", "step": 58, "epoch": 1 }, { "type": "loss", "content": 0.015296747907996178, "timestamp": "2025-09-30 22:09:15.364228", "step": 59, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:15.400614", "step": 59, "epoch": 1 }, { "type": "loss", "content": 0.019934866577386856, "timestamp": "2025-09-30 22:09:15.424173", "step": 60, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:15.460132", "step": 60, "epoch": 1 }, { "type": "loss", "content": 0.018061354756355286, "timestamp": "2025-09-30 22:09:15.467461", "step": 61, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:15.508520", "step": 61, "epoch": 1 }, { "type": "loss", "content": 0.02194632776081562, "timestamp": "2025-09-30 22:09:15.510931", "step": 62, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:15.556104", "step": 62, "epoch": 1 }, { "type": "loss", "content": 0.009745917282998562, "timestamp": "2025-09-30 22:09:15.560689", "step": 63, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:15.596293", "step": 63, "epoch": 1 }, { "type": "loss", "content": 0.009664694778621197, "timestamp": "2025-09-30 22:09:15.620651", "step": 64, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:15.654839", "step": 64, "epoch": 1 }, { "type": "loss", "content": 0.01291805598884821, "timestamp": "2025-09-30 22:09:15.657181", "step": 65, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:15.696461", "step": 65, "epoch": 1 }, { "type": "loss", "content": 0.026677941903471947, "timestamp": "2025-09-30 22:09:15.698887", "step": 66, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:15.733022", "step": 66, "epoch": 1 }, { "type": "loss", "content": 0.014336864463984966, "timestamp": "2025-09-30 22:09:15.739349", "step": 67, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:15.778181", "step": 67, "epoch": 1 }, { "type": "loss", "content": 0.025222107768058777, "timestamp": "2025-09-30 22:09:15.802313", "step": 68, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:15.840896", "step": 68, "epoch": 1 }, { "type": "loss", "content": 0.028957298025488853, "timestamp": "2025-09-30 22:09:15.843184", "step": 69, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:15.880458", "step": 69, "epoch": 1 }, { "type": "loss", "content": 0.01850477233529091, "timestamp": "2025-09-30 22:09:15.886068", "step": 70, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:15.918627", "step": 70, "epoch": 1 }, { "type": "loss", "content": 0.01517259981483221, "timestamp": "2025-09-30 22:09:15.924044", "step": 71, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:15.973451", "step": 71, "epoch": 1 }, { "type": "loss", "content": 0.02591550536453724, "timestamp": "2025-09-30 22:09:15.999792", "step": 72, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:16.035417", "step": 72, "epoch": 1 }, { "type": "loss", "content": 0.01952301897108555, "timestamp": "2025-09-30 22:09:16.038792", "step": 73, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:16.071567", "step": 73, "epoch": 1 }, { "type": "loss", "content": 0.02474440075457096, "timestamp": "2025-09-30 22:09:16.074385", "step": 74, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:16.105987", "step": 74, "epoch": 1 }, { "type": "loss", "content": 0.01817135512828827, "timestamp": "2025-09-30 22:09:16.113633", "step": 75, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:16.144323", "step": 75, "epoch": 1 }, { "type": "loss", "content": 0.021403932943940163, "timestamp": "2025-09-30 22:09:16.168243", "step": 76, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:16.203430", "step": 76, "epoch": 1 }, { "type": "loss", "content": 0.02099829539656639, "timestamp": "2025-09-30 22:09:16.208170", "step": 77, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:16.243817", "step": 77, "epoch": 1 }, { "type": "loss", "content": 0.02719767577946186, "timestamp": "2025-09-30 22:09:16.246103", "step": 78, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:16.280828", "step": 78, "epoch": 1 }, { "type": "loss", "content": 0.02752767875790596, "timestamp": "2025-09-30 22:09:16.283641", "step": 79, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:16.325063", "step": 79, "epoch": 1 }, { "type": "loss", "content": 0.022269679233431816, "timestamp": "2025-09-30 22:09:16.353743", "step": 80, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:16.389524", "step": 80, "epoch": 1 }, { "type": "loss", "content": 0.021567845717072487, "timestamp": "2025-09-30 22:09:16.393149", "step": 81, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:16.432841", "step": 81, "epoch": 1 }, { "type": "loss", "content": 0.023687588050961494, "timestamp": "2025-09-30 22:09:16.437550", "step": 82, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:16.473060", "step": 82, "epoch": 1 }, { "type": "loss", "content": 0.03083338961005211, "timestamp": "2025-09-30 22:09:16.475688", "step": 83, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:16.508283", "step": 83, "epoch": 1 }, { "type": "loss", "content": 0.027233019471168518, "timestamp": "2025-09-30 22:09:16.535391", "step": 84, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:16.569498", "step": 84, "epoch": 1 }, { "type": "loss", "content": 0.024067306891083717, "timestamp": "2025-09-30 22:09:16.572035", "step": 85, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:16.609005", "step": 85, "epoch": 1 }, { "type": "loss", "content": 0.01529394369572401, "timestamp": "2025-09-30 22:09:16.612011", "step": 86, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:16.655316", "step": 86, "epoch": 1 }, { "type": "loss", "content": 0.01770094782114029, "timestamp": "2025-09-30 22:09:16.658191", "step": 87, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:16.696119", "step": 87, "epoch": 1 }, { "type": "loss", "content": 0.017579296603798866, "timestamp": "2025-09-30 22:09:16.722340", "step": 88, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:16.754011", "step": 88, "epoch": 1 }, { "type": "loss", "content": 0.016920411959290504, "timestamp": "2025-09-30 22:09:16.756733", "step": 89, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:16.790972", "step": 89, "epoch": 1 }, { "type": "loss", "content": 0.01924826018512249, "timestamp": "2025-09-30 22:09:16.793910", "step": 90, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:16.835103", "step": 90, "epoch": 1 }, { "type": "loss", "content": 0.020246148109436035, "timestamp": "2025-09-30 22:09:16.838536", "step": 91, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:16.877822", "step": 91, "epoch": 1 }, { "type": "loss", "content": 0.026985352858901024, "timestamp": "2025-09-30 22:09:16.902800", "step": 92, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:16.946383", "step": 92, "epoch": 1 }, { "type": "loss", "content": 0.015785491093993187, "timestamp": "2025-09-30 22:09:16.954224", "step": 93, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:16.996404", "step": 93, "epoch": 1 }, { "type": "loss", "content": 0.024172216653823853, "timestamp": "2025-09-30 22:09:17.002136", "step": 94, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:17.050024", "step": 94, "epoch": 1 }, { "type": "loss", "content": 0.01940486952662468, "timestamp": "2025-09-30 22:09:17.054239", "step": 95, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:17.086262", "step": 95, "epoch": 1 }, { "type": "loss", "content": 0.023085501044988632, "timestamp": "2025-09-30 22:09:17.116579", "step": 96, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:17.157370", "step": 96, "epoch": 1 }, { "type": "loss", "content": 0.01945091411471367, "timestamp": "2025-09-30 22:09:17.160912", "step": 97, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:17.199023", "step": 97, "epoch": 1 }, { "type": "loss", "content": 0.020526302978396416, "timestamp": "2025-09-30 22:09:17.202637", "step": 98, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:17.241885", "step": 98, "epoch": 1 }, { "type": "loss", "content": 0.017805086448788643, "timestamp": "2025-09-30 22:09:17.244654", "step": 99, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:17.285187", "step": 99, "epoch": 1 }, { "type": "loss", "content": 0.017751798033714294, "timestamp": "2025-09-30 22:09:17.309846", "step": 100, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:17.349427", "step": 100, "epoch": 1 }, { "type": "loss", "content": 0.01985841430723667, "timestamp": "2025-09-30 22:09:17.352096", "step": 101, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:17.383952", "step": 101, "epoch": 1 }, { "type": "loss", "content": 0.018005359917879105, "timestamp": "2025-09-30 22:09:17.387053", "step": 102, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:17.422484", "step": 102, "epoch": 1 }, { "type": "loss", "content": 0.01930832304060459, "timestamp": "2025-09-30 22:09:17.424941", "step": 103, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:17.456526", "step": 103, "epoch": 1 }, { "type": "loss", "content": 0.028776034712791443, "timestamp": "2025-09-30 22:09:17.480383", "step": 104, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:17.515503", "step": 104, "epoch": 1 }, { "type": "loss", "content": 0.030385851860046387, "timestamp": "2025-09-30 22:09:17.518190", "step": 105, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:17.555704", "step": 105, "epoch": 1 }, { "type": "loss", "content": 0.009547047317028046, "timestamp": "2025-09-30 22:09:17.558826", "step": 106, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:17.590456", "step": 106, "epoch": 1 }, { "type": "loss", "content": 0.021585334092378616, "timestamp": "2025-09-30 22:09:17.594339", "step": 107, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:17.633081", "step": 107, "epoch": 1 }, { "type": "loss", "content": 0.04027863219380379, "timestamp": "2025-09-30 22:09:17.663088", "step": 108, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:17.706977", "step": 108, "epoch": 1 }, { "type": "loss", "content": 0.04367966577410698, "timestamp": "2025-09-30 22:09:17.710272", "step": 109, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:09:17.754783", "step": 109, "epoch": 1 }, { "type": "loss", "content": 0.03343404084444046, "timestamp": "2025-09-30 22:09:17.758998", "step": 110, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:17.799151", "step": 110, "epoch": 1 }, { "type": "loss", "content": 0.009330673143267632, "timestamp": "2025-09-30 22:09:17.806239", "step": 111, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:17.842615", "step": 111, "epoch": 1 }, { "type": "loss", "content": 0.028857093304395676, "timestamp": "2025-09-30 22:09:17.871630", "step": 112, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:17.903221", "step": 112, "epoch": 1 }, { "type": "loss", "content": 0.025730669498443604, "timestamp": "2025-09-30 22:09:17.905760", "step": 113, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:17.943633", "step": 113, "epoch": 1 }, { "type": "loss", "content": 0.021670332178473473, "timestamp": "2025-09-30 22:09:17.947613", "step": 114, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:09:18.896142", "step": 114, "epoch": 1 }, { "type": "pplx", "content": 58534139.80761209, "timestamp": "2025-09-30 22:09:18.898883", "step": 114, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:18.927855", "step": 114, "epoch": 1 }, { "type": "loss", "content": 0.009823307394981384, "timestamp": "2025-09-30 22:09:18.929993", "step": 115, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:18.967139", "step": 115, "epoch": 1 }, { "type": "loss", "content": 0.028453484177589417, "timestamp": "2025-09-30 22:09:18.991551", "step": 116, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:19.026563", "step": 116, "epoch": 1 }, { "type": "loss", "content": 0.023278359323740005, "timestamp": "2025-09-30 22:09:19.030352", "step": 117, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:19.067552", "step": 117, "epoch": 1 }, { "type": "loss", "content": 0.017218248918652534, "timestamp": "2025-09-30 22:09:19.069876", "step": 118, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:19.104189", "step": 118, "epoch": 1 }, { "type": "loss", "content": 0.015407579019665718, "timestamp": "2025-09-30 22:09:19.111364", "step": 119, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:19.156579", "step": 119, "epoch": 1 }, { "type": "loss", "content": 0.02096530608832836, "timestamp": "2025-09-30 22:09:19.181183", "step": 120, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:19.222936", "step": 120, "epoch": 1 }, { "type": "loss", "content": 0.030719464644789696, "timestamp": "2025-09-30 22:09:19.225745", "step": 121, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:19.256834", "step": 121, "epoch": 1 }, { "type": "loss", "content": 0.023789221420884132, "timestamp": "2025-09-30 22:09:19.259053", "step": 122, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:19.293113", "step": 122, "epoch": 1 }, { "type": "loss", "content": 0.02422069013118744, "timestamp": "2025-09-30 22:09:19.303236", "step": 123, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:19.352509", "step": 123, "epoch": 1 }, { "type": "loss", "content": 0.015484781935811043, "timestamp": "2025-09-30 22:09:19.378591", "step": 124, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:19.414774", "step": 124, "epoch": 1 }, { "type": "loss", "content": 0.021923230960965157, "timestamp": "2025-09-30 22:09:19.418211", "step": 125, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:19.450426", "step": 125, "epoch": 1 }, { "type": "loss", "content": 0.03109300322830677, "timestamp": "2025-09-30 22:09:19.454284", "step": 126, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:19.492377", "step": 126, "epoch": 1 }, { "type": "loss", "content": 0.02031245455145836, "timestamp": "2025-09-30 22:09:19.495342", "step": 127, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:19.527494", "step": 127, "epoch": 1 }, { "type": "loss", "content": 0.022137144580483437, "timestamp": "2025-09-30 22:09:19.552142", "step": 128, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:19.583975", "step": 128, "epoch": 1 }, { "type": "loss", "content": 0.020822782069444656, "timestamp": "2025-09-30 22:09:19.586482", "step": 129, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:19.625574", "step": 129, "epoch": 1 }, { "type": "loss", "content": 0.027242343872785568, "timestamp": "2025-09-30 22:09:19.628308", "step": 130, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:19.662693", "step": 130, "epoch": 1 }, { "type": "loss", "content": 0.027840612456202507, "timestamp": "2025-09-30 22:09:19.665395", "step": 131, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:19.697783", "step": 131, "epoch": 1 }, { "type": "loss", "content": 0.020926009863615036, "timestamp": "2025-09-30 22:09:19.725080", "step": 132, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:19.756602", "step": 132, "epoch": 1 }, { "type": "loss", "content": 0.014202489517629147, "timestamp": "2025-09-30 22:09:19.758795", "step": 133, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:19.789976", "step": 133, "epoch": 1 }, { "type": "loss", "content": 0.024952108040452003, "timestamp": "2025-09-30 22:09:19.792420", "step": 134, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:19.826685", "step": 134, "epoch": 1 }, { "type": "loss", "content": 0.023683322593569756, "timestamp": "2025-09-30 22:09:19.828977", "step": 135, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:19.863113", "step": 135, "epoch": 1 }, { "type": "loss", "content": 0.02011030726134777, "timestamp": "2025-09-30 22:09:19.887275", "step": 136, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:19.928020", "step": 136, "epoch": 1 }, { "type": "loss", "content": 0.022830024361610413, "timestamp": "2025-09-30 22:09:19.931512", "step": 137, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:19.963089", "step": 137, "epoch": 1 }, { "type": "loss", "content": 0.02508159540593624, "timestamp": "2025-09-30 22:09:19.967458", "step": 138, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:20.000658", "step": 138, "epoch": 1 }, { "type": "loss", "content": 0.016803130507469177, "timestamp": "2025-09-30 22:09:20.004268", "step": 139, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:20.040017", "step": 139, "epoch": 1 }, { "type": "loss", "content": 0.026307707652449608, "timestamp": "2025-09-30 22:09:20.066332", "step": 140, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:20.103807", "step": 140, "epoch": 1 }, { "type": "loss", "content": 0.01853339932858944, "timestamp": "2025-09-30 22:09:20.107421", "step": 141, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:20.146944", "step": 141, "epoch": 1 }, { "type": "loss", "content": 0.01986740343272686, "timestamp": "2025-09-30 22:09:20.149468", "step": 142, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:20.180455", "step": 142, "epoch": 1 }, { "type": "loss", "content": 0.026442429050803185, "timestamp": "2025-09-30 22:09:20.183589", "step": 143, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:20.224127", "step": 143, "epoch": 1 }, { "type": "loss", "content": 0.013860334642231464, "timestamp": "2025-09-30 22:09:20.248424", "step": 144, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:20.279970", "step": 144, "epoch": 1 }, { "type": "loss", "content": 0.01697557047009468, "timestamp": "2025-09-30 22:09:20.285375", "step": 145, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:20.321143", "step": 145, "epoch": 1 }, { "type": "loss", "content": 0.01053065899759531, "timestamp": "2025-09-30 22:09:20.323946", "step": 146, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:20.357847", "step": 146, "epoch": 1 }, { "type": "loss", "content": 0.023069296032190323, "timestamp": "2025-09-30 22:09:20.365149", "step": 147, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:20.408388", "step": 147, "epoch": 1 }, { "type": "loss", "content": 0.031005268916487694, "timestamp": "2025-09-30 22:09:20.432411", "step": 148, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:20.463781", "step": 148, "epoch": 1 }, { "type": "loss", "content": 0.04410288482904434, "timestamp": "2025-09-30 22:09:20.466537", "step": 149, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:20.500020", "step": 149, "epoch": 1 }, { "type": "loss", "content": 0.04395507648587227, "timestamp": "2025-09-30 22:09:20.502770", "step": 150, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:20.544615", "step": 150, "epoch": 1 }, { "type": "loss", "content": 0.017706802114844322, "timestamp": "2025-09-30 22:09:20.557460", "step": 151, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:20.599912", "step": 151, "epoch": 1 }, { "type": "loss", "content": 0.007974297739565372, "timestamp": "2025-09-30 22:09:20.623923", "step": 152, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:20.674167", "step": 152, "epoch": 1 }, { "type": "loss", "content": 0.010354185476899147, "timestamp": "2025-09-30 22:09:20.682383", "step": 153, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:20.735253", "step": 153, "epoch": 1 }, { "type": "loss", "content": 0.02493741549551487, "timestamp": "2025-09-30 22:09:20.741201", "step": 154, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:20.782802", "step": 154, "epoch": 1 }, { "type": "loss", "content": 0.030498506501317024, "timestamp": "2025-09-30 22:09:20.792049", "step": 155, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:20.829033", "step": 155, "epoch": 1 }, { "type": "loss", "content": 0.020751025527715683, "timestamp": "2025-09-30 22:09:20.855163", "step": 156, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:20.892618", "step": 156, "epoch": 1 }, { "type": "loss", "content": 0.04443346709012985, "timestamp": "2025-09-30 22:09:20.895268", "step": 157, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:20.928220", "step": 157, "epoch": 1 }, { "type": "loss", "content": 0.06002501770853996, "timestamp": "2025-09-30 22:09:20.936899", "step": 158, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:20.972521", "step": 158, "epoch": 1 }, { "type": "loss", "content": 0.04527200013399124, "timestamp": "2025-09-30 22:09:20.977524", "step": 159, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:21.009917", "step": 159, "epoch": 1 }, { "type": "loss", "content": 0.03547225147485733, "timestamp": "2025-09-30 22:09:21.039356", "step": 160, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:21.073410", "step": 160, "epoch": 1 }, { "type": "loss", "content": 0.02295391820371151, "timestamp": "2025-09-30 22:09:21.082083", "step": 161, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:21.116796", "step": 161, "epoch": 1 }, { "type": "loss", "content": 0.03174513950943947, "timestamp": "2025-09-30 22:09:21.122115", "step": 162, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:21.167665", "step": 162, "epoch": 1 }, { "type": "loss", "content": 0.02424296736717224, "timestamp": "2025-09-30 22:09:21.172311", "step": 163, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:21.219074", "step": 163, "epoch": 1 }, { "type": "loss", "content": 0.017598729580640793, "timestamp": "2025-09-30 22:09:21.244379", "step": 164, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:21.286836", "step": 164, "epoch": 1 }, { "type": "loss", "content": 0.02205030620098114, "timestamp": "2025-09-30 22:09:21.289414", "step": 165, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:21.322458", "step": 165, "epoch": 1 }, { "type": "loss", "content": 0.027921559289097786, "timestamp": "2025-09-30 22:09:21.325980", "step": 166, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:21.365573", "step": 166, "epoch": 1 }, { "type": "loss", "content": 0.029713410884141922, "timestamp": "2025-09-30 22:09:21.368535", "step": 167, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:21.405306", "step": 167, "epoch": 1 }, { "type": "loss", "content": 0.017205597832798958, "timestamp": "2025-09-30 22:09:21.430538", "step": 168, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:21.476090", "step": 168, "epoch": 1 }, { "type": "loss", "content": 0.02522202394902706, "timestamp": "2025-09-30 22:09:21.479456", "step": 169, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:21.512103", "step": 169, "epoch": 1 }, { "type": "loss", "content": 0.029750773683190346, "timestamp": "2025-09-30 22:09:21.520054", "step": 170, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:21.560214", "step": 170, "epoch": 1 }, { "type": "loss", "content": 0.02134212851524353, "timestamp": "2025-09-30 22:09:21.567431", "step": 171, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:09:22.542027", "step": 171, "epoch": 1 }, { "type": "pplx", "content": 63135953.81431602, "timestamp": "2025-09-30 22:09:22.549503", "step": 171, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:22.579778", "step": 171, "epoch": 1 }, { "type": "loss", "content": 0.025313351303339005, "timestamp": "2025-09-30 22:09:22.609048", "step": 172, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:22.644211", "step": 172, "epoch": 1 }, { "type": "loss", "content": 0.02162635698914528, "timestamp": "2025-09-30 22:09:22.648087", "step": 173, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:22.685698", "step": 173, "epoch": 1 }, { "type": "loss", "content": 0.023029791191220284, "timestamp": "2025-09-30 22:09:22.700922", "step": 174, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:22.737162", "step": 174, "epoch": 1 }, { "type": "loss", "content": 0.02027418650686741, "timestamp": "2025-09-30 22:09:22.739647", "step": 175, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:22.773725", "step": 175, "epoch": 1 }, { "type": "loss", "content": 0.02678442932665348, "timestamp": "2025-09-30 22:09:22.798748", "step": 176, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:22.833449", "step": 176, "epoch": 1 }, { "type": "loss", "content": 0.019500982016324997, "timestamp": "2025-09-30 22:09:22.836804", "step": 177, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:22.869460", "step": 177, "epoch": 1 }, { "type": "loss", "content": 0.018234090879559517, "timestamp": "2025-09-30 22:09:22.873359", "step": 178, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:22.914959", "step": 178, "epoch": 1 }, { "type": "loss", "content": 0.01549634337425232, "timestamp": "2025-09-30 22:09:22.916908", "step": 179, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:22.949079", "step": 179, "epoch": 1 }, { "type": "loss", "content": 0.023833613842725754, "timestamp": "2025-09-30 22:09:22.972976", "step": 180, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:23.011745", "step": 180, "epoch": 1 }, { "type": "loss", "content": 0.020760536193847656, "timestamp": "2025-09-30 22:09:23.016768", "step": 181, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:23.055342", "step": 181, "epoch": 1 }, { "type": "loss", "content": 0.02735050581395626, "timestamp": "2025-09-30 22:09:23.057868", "step": 182, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:23.090773", "step": 182, "epoch": 1 }, { "type": "loss", "content": 0.04049814119935036, "timestamp": "2025-09-30 22:09:23.093111", "step": 183, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:23.125859", "step": 183, "epoch": 1 }, { "type": "loss", "content": 0.03238435834646225, "timestamp": "2025-09-30 22:09:23.150246", "step": 184, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:23.184750", "step": 184, "epoch": 1 }, { "type": "loss", "content": 0.02755420282483101, "timestamp": "2025-09-30 22:09:23.187523", "step": 185, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:23.219075", "step": 185, "epoch": 1 }, { "type": "loss", "content": 0.021524375304579735, "timestamp": "2025-09-30 22:09:23.221725", "step": 186, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:23.256297", "step": 186, "epoch": 1 }, { "type": "loss", "content": 0.028563236817717552, "timestamp": "2025-09-30 22:09:23.260646", "step": 187, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:23.298047", "step": 187, "epoch": 1 }, { "type": "loss", "content": 0.020461514592170715, "timestamp": "2025-09-30 22:09:23.321944", "step": 188, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:23.359970", "step": 188, "epoch": 1 }, { "type": "loss", "content": 0.017072943970561028, "timestamp": "2025-09-30 22:09:23.362327", "step": 189, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:23.398504", "step": 189, "epoch": 1 }, { "type": "loss", "content": 0.023945823311805725, "timestamp": "2025-09-30 22:09:23.402855", "step": 190, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:23.440466", "step": 190, "epoch": 1 }, { "type": "loss", "content": 0.018736975267529488, "timestamp": "2025-09-30 22:09:23.442991", "step": 191, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:23.482508", "step": 191, "epoch": 1 }, { "type": "loss", "content": 0.02092103660106659, "timestamp": "2025-09-30 22:09:23.508844", "step": 192, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:23.548356", "step": 192, "epoch": 1 }, { "type": "loss", "content": 0.011695189401507378, "timestamp": "2025-09-30 22:09:23.561921", "step": 193, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:23.604456", "step": 193, "epoch": 1 }, { "type": "loss", "content": 0.012270850129425526, "timestamp": "2025-09-30 22:09:23.607780", "step": 194, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:23.641746", "step": 194, "epoch": 1 }, { "type": "loss", "content": 0.017633767798542976, "timestamp": "2025-09-30 22:09:23.645103", "step": 195, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:23.681614", "step": 195, "epoch": 1 }, { "type": "loss", "content": 0.02073209546506405, "timestamp": "2025-09-30 22:09:23.706313", "step": 196, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:23.745909", "step": 196, "epoch": 1 }, { "type": "loss", "content": 0.011094124056398869, "timestamp": "2025-09-30 22:09:23.755657", "step": 197, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:23.788928", "step": 197, "epoch": 1 }, { "type": "loss", "content": 0.02200642041862011, "timestamp": "2025-09-30 22:09:23.791820", "step": 198, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:23.828375", "step": 198, "epoch": 1 }, { "type": "loss", "content": 0.03361174091696739, "timestamp": "2025-09-30 22:09:23.830747", "step": 199, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:23.872473", "step": 199, "epoch": 1 }, { "type": "loss", "content": 0.032245974987745285, "timestamp": "2025-09-30 22:09:23.897029", "step": 200, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:23.929554", "step": 200, "epoch": 1 }, { "type": "loss", "content": 0.017100024968385696, "timestamp": "2025-09-30 22:09:23.935873", "step": 201, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:23.975347", "step": 201, "epoch": 1 }, { "type": "loss", "content": 0.019537942484021187, "timestamp": "2025-09-30 22:09:23.977741", "step": 202, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:24.022466", "step": 202, "epoch": 1 }, { "type": "loss", "content": 0.02034500427544117, "timestamp": "2025-09-30 22:09:24.024897", "step": 203, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:24.057311", "step": 203, "epoch": 1 }, { "type": "loss", "content": 0.02110188640654087, "timestamp": "2025-09-30 22:09:24.080940", "step": 204, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:24.116185", "step": 204, "epoch": 1 }, { "type": "loss", "content": 0.02113529108464718, "timestamp": "2025-09-30 22:09:24.123925", "step": 205, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:24.170996", "step": 205, "epoch": 1 }, { "type": "loss", "content": 0.010029876604676247, "timestamp": "2025-09-30 22:09:24.177332", "step": 206, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:24.211497", "step": 206, "epoch": 1 }, { "type": "loss", "content": 0.030450621619820595, "timestamp": "2025-09-30 22:09:24.220174", "step": 207, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:24.261166", "step": 207, "epoch": 1 }, { "type": "loss", "content": 0.019182788208127022, "timestamp": "2025-09-30 22:09:24.285128", "step": 208, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:24.318848", "step": 208, "epoch": 1 }, { "type": "loss", "content": 0.030275586992502213, "timestamp": "2025-09-30 22:09:24.323077", "step": 209, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:24.364047", "step": 209, "epoch": 1 }, { "type": "loss", "content": 0.03572225570678711, "timestamp": "2025-09-30 22:09:24.377896", "step": 210, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:24.416921", "step": 210, "epoch": 1 }, { "type": "loss", "content": 0.018198251724243164, "timestamp": "2025-09-30 22:09:24.421098", "step": 211, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:24.457468", "step": 211, "epoch": 1 }, { "type": "loss", "content": 0.04005300626158714, "timestamp": "2025-09-30 22:09:24.482710", "step": 212, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:24.520417", "step": 212, "epoch": 1 }, { "type": "loss", "content": 0.029761286452412605, "timestamp": "2025-09-30 22:09:24.523772", "step": 213, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:24.566134", "step": 213, "epoch": 1 }, { "type": "loss", "content": 0.018643589690327644, "timestamp": "2025-09-30 22:09:24.569736", "step": 214, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:24.626292", "step": 214, "epoch": 1 }, { "type": "loss", "content": 0.026240238919854164, "timestamp": "2025-09-30 22:09:24.630332", "step": 215, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:24.679963", "step": 215, "epoch": 1 }, { "type": "loss", "content": 0.0247743409126997, "timestamp": "2025-09-30 22:09:24.705726", "step": 216, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:24.742582", "step": 216, "epoch": 1 }, { "type": "loss", "content": 0.02074492536485195, "timestamp": "2025-09-30 22:09:24.748216", "step": 217, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:24.804052", "step": 217, "epoch": 1 }, { "type": "loss", "content": 0.024284886196255684, "timestamp": "2025-09-30 22:09:24.815954", "step": 218, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:24.849931", "step": 218, "epoch": 1 }, { "type": "loss", "content": 0.022740570828318596, "timestamp": "2025-09-30 22:09:24.854642", "step": 219, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:24.894445", "step": 219, "epoch": 1 }, { "type": "loss", "content": 0.01697150431573391, "timestamp": "2025-09-30 22:09:24.919994", "step": 220, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:24.960812", "step": 220, "epoch": 1 }, { "type": "loss", "content": 0.021439313888549805, "timestamp": "2025-09-30 22:09:24.965122", "step": 221, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:25.011331", "step": 221, "epoch": 1 }, { "type": "loss", "content": 0.023758551105856895, "timestamp": "2025-09-30 22:09:25.014195", "step": 222, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:25.057607", "step": 222, "epoch": 1 }, { "type": "loss", "content": 0.019697774201631546, "timestamp": "2025-09-30 22:09:25.070052", "step": 223, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:25.119111", "step": 223, "epoch": 1 }, { "type": "loss", "content": 0.02828163094818592, "timestamp": "2025-09-30 22:09:25.144906", "step": 224, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:25.179325", "step": 224, "epoch": 1 }, { "type": "loss", "content": 0.01745227910578251, "timestamp": "2025-09-30 22:09:25.182684", "step": 225, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:25.216373", "step": 225, "epoch": 1 }, { "type": "loss", "content": 0.020431501790881157, "timestamp": "2025-09-30 22:09:25.219420", "step": 226, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:25.262506", "step": 226, "epoch": 1 }, { "type": "loss", "content": 0.02470734715461731, "timestamp": "2025-09-30 22:09:25.266184", "step": 227, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:25.300680", "step": 227, "epoch": 1 }, { "type": "loss", "content": 0.020329369232058525, "timestamp": "2025-09-30 22:09:25.334365", "step": 228, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:09:26.396318", "step": 228, "epoch": 1 }, { "type": "pplx", "content": 64583602.40184191, "timestamp": "2025-09-30 22:09:26.400761", "step": 228, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:26.436859", "step": 228, "epoch": 1 }, { "type": "loss", "content": 0.020061250776052475, "timestamp": "2025-09-30 22:09:26.445307", "step": 229, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:26.482340", "step": 229, "epoch": 1 }, { "type": "loss", "content": 0.023501979187130928, "timestamp": "2025-09-30 22:09:26.486298", "step": 230, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:26.520038", "step": 230, "epoch": 1 }, { "type": "loss", "content": 0.022963250055909157, "timestamp": "2025-09-30 22:09:26.523926", "step": 231, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:26.557188", "step": 231, "epoch": 1 }, { "type": "loss", "content": 0.026661524549126625, "timestamp": "2025-09-30 22:09:26.583351", "step": 232, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:26.617885", "step": 232, "epoch": 1 }, { "type": "loss", "content": 0.02811359241604805, "timestamp": "2025-09-30 22:09:26.622318", "step": 233, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:26.656864", "step": 233, "epoch": 1 }, { "type": "loss", "content": 0.020175501704216003, "timestamp": "2025-09-30 22:09:26.660057", "step": 234, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:26.704221", "step": 234, "epoch": 1 }, { "type": "loss", "content": 0.021329758688807487, "timestamp": "2025-09-30 22:09:26.707883", "step": 235, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:26.744204", "step": 235, "epoch": 1 }, { "type": "loss", "content": 0.023740265518426895, "timestamp": "2025-09-30 22:09:26.769331", "step": 236, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:26.802865", "step": 236, "epoch": 1 }, { "type": "loss", "content": 0.02140076644718647, "timestamp": "2025-09-30 22:09:26.811659", "step": 237, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:26.849074", "step": 237, "epoch": 1 }, { "type": "loss", "content": 0.024655591696500778, "timestamp": "2025-09-30 22:09:26.853322", "step": 238, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:26.897500", "step": 238, "epoch": 1 }, { "type": "loss", "content": 0.02812766842544079, "timestamp": "2025-09-30 22:09:26.900902", "step": 239, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:26.937171", "step": 239, "epoch": 1 }, { "type": "loss", "content": 0.021700652316212654, "timestamp": "2025-09-30 22:09:26.961785", "step": 240, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:26.993967", "step": 240, "epoch": 1 }, { "type": "loss", "content": 0.017045430839061737, "timestamp": "2025-09-30 22:09:26.997221", "step": 241, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:27.029087", "step": 241, "epoch": 1 }, { "type": "loss", "content": 0.014452247880399227, "timestamp": "2025-09-30 22:09:27.035776", "step": 242, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:27.083171", "step": 242, "epoch": 1 }, { "type": "loss", "content": 0.01355509739369154, "timestamp": "2025-09-30 22:09:27.086116", "step": 243, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:27.117351", "step": 243, "epoch": 1 }, { "type": "loss", "content": 0.01483946107327938, "timestamp": "2025-09-30 22:09:27.141796", "step": 244, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:27.184555", "step": 244, "epoch": 1 }, { "type": "loss", "content": 0.03674229606986046, "timestamp": "2025-09-30 22:09:27.193271", "step": 245, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:27.240463", "step": 245, "epoch": 1 }, { "type": "loss", "content": 0.03379470854997635, "timestamp": "2025-09-30 22:09:27.249237", "step": 246, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:27.291190", "step": 246, "epoch": 1 }, { "type": "loss", "content": 0.031083837151527405, "timestamp": "2025-09-30 22:09:27.294330", "step": 247, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:27.327714", "step": 247, "epoch": 1 }, { "type": "loss", "content": 0.02411106415092945, "timestamp": "2025-09-30 22:09:27.351673", "step": 248, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:27.392985", "step": 248, "epoch": 1 }, { "type": "loss", "content": 0.03847876191139221, "timestamp": "2025-09-30 22:09:27.397613", "step": 249, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:27.434230", "step": 249, "epoch": 1 }, { "type": "loss", "content": 0.013434567488729954, "timestamp": "2025-09-30 22:09:27.437393", "step": 250, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:27.470533", "step": 250, "epoch": 1 }, { "type": "loss", "content": 0.0336221382021904, "timestamp": "2025-09-30 22:09:27.482598", "step": 251, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:27.518775", "step": 251, "epoch": 1 }, { "type": "loss", "content": 0.011063228361308575, "timestamp": "2025-09-30 22:09:27.545257", "step": 252, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:27.587336", "step": 252, "epoch": 1 }, { "type": "loss", "content": 0.009829501621425152, "timestamp": "2025-09-30 22:09:27.590919", "step": 253, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:27.625865", "step": 253, "epoch": 1 }, { "type": "loss", "content": 0.02949223481118679, "timestamp": "2025-09-30 22:09:27.628946", "step": 254, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:27.669542", "step": 254, "epoch": 1 }, { "type": "loss", "content": 0.01204278226941824, "timestamp": "2025-09-30 22:09:27.673618", "step": 255, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:09:27.709938", "step": 255, "epoch": 1 }, { "type": "loss", "content": 0.016949284821748734, "timestamp": "2025-09-30 22:09:27.735954", "step": 256, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:27.774681", "step": 256, "epoch": 1 }, { "type": "loss", "content": 0.01266380213201046, "timestamp": "2025-09-30 22:09:27.778898", "step": 257, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:27.812145", "step": 257, "epoch": 1 }, { "type": "loss", "content": 0.03958168253302574, "timestamp": "2025-09-30 22:09:27.818913", "step": 258, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:27.859520", "step": 258, "epoch": 1 }, { "type": "loss", "content": 0.02375471219420433, "timestamp": "2025-09-30 22:09:27.862962", "step": 259, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:27.902512", "step": 259, "epoch": 1 }, { "type": "loss", "content": 0.022940808907151222, "timestamp": "2025-09-30 22:09:27.926849", "step": 260, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:27.959007", "step": 260, "epoch": 1 }, { "type": "loss", "content": 0.023124128580093384, "timestamp": "2025-09-30 22:09:27.963644", "step": 261, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:27.996992", "step": 261, "epoch": 1 }, { "type": "loss", "content": 0.020926760509610176, "timestamp": "2025-09-30 22:09:28.000115", "step": 262, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:28.033434", "step": 262, "epoch": 1 }, { "type": "loss", "content": 0.020220564678311348, "timestamp": "2025-09-30 22:09:28.036473", "step": 263, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:28.073437", "step": 263, "epoch": 1 }, { "type": "loss", "content": 0.011818875558674335, "timestamp": "2025-09-30 22:09:28.097489", "step": 264, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:28.130951", "step": 264, "epoch": 1 }, { "type": "loss", "content": 0.022864850237965584, "timestamp": "2025-09-30 22:09:28.135352", "step": 265, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:28.169449", "step": 265, "epoch": 1 }, { "type": "loss", "content": 0.021252285689115524, "timestamp": "2025-09-30 22:09:28.174293", "step": 266, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:28.214276", "step": 266, "epoch": 1 }, { "type": "loss", "content": 0.021793512627482414, "timestamp": "2025-09-30 22:09:28.222872", "step": 267, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:28.261060", "step": 267, "epoch": 1 }, { "type": "loss", "content": 0.03829879313707352, "timestamp": "2025-09-30 22:09:28.285559", "step": 268, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:28.317907", "step": 268, "epoch": 1 }, { "type": "loss", "content": 0.016349567100405693, "timestamp": "2025-09-30 22:09:28.324735", "step": 269, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:28.356375", "step": 269, "epoch": 1 }, { "type": "loss", "content": 0.021319296211004257, "timestamp": "2025-09-30 22:09:28.364055", "step": 270, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:28.397989", "step": 270, "epoch": 1 }, { "type": "loss", "content": 0.023110728710889816, "timestamp": "2025-09-30 22:09:28.405917", "step": 271, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:28.446429", "step": 271, "epoch": 1 }, { "type": "loss", "content": 0.01968984305858612, "timestamp": "2025-09-30 22:09:28.473040", "step": 272, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:28.505796", "step": 272, "epoch": 1 }, { "type": "loss", "content": 0.024730121716856956, "timestamp": "2025-09-30 22:09:28.508918", "step": 273, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:28.549681", "step": 273, "epoch": 1 }, { "type": "loss", "content": 0.014643555507063866, "timestamp": "2025-09-30 22:09:28.556887", "step": 274, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:28.589634", "step": 274, "epoch": 1 }, { "type": "loss", "content": 0.019117284566164017, "timestamp": "2025-09-30 22:09:28.592910", "step": 275, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:28.627191", "step": 275, "epoch": 1 }, { "type": "loss", "content": 0.027165459468960762, "timestamp": "2025-09-30 22:09:28.655612", "step": 276, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:28.692616", "step": 276, "epoch": 1 }, { "type": "loss", "content": 0.0140123525634408, "timestamp": "2025-09-30 22:09:28.695435", "step": 277, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:28.730800", "step": 277, "epoch": 1 }, { "type": "loss", "content": 0.02771705947816372, "timestamp": "2025-09-30 22:09:28.734046", "step": 278, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:28.767972", "step": 278, "epoch": 1 }, { "type": "loss", "content": 0.022224754095077515, "timestamp": "2025-09-30 22:09:28.771316", "step": 279, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:28.808793", "step": 279, "epoch": 1 }, { "type": "loss", "content": 0.04482335224747658, "timestamp": "2025-09-30 22:09:28.835645", "step": 280, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:28.871446", "step": 280, "epoch": 1 }, { "type": "loss", "content": 0.017204681411385536, "timestamp": "2025-09-30 22:09:28.876235", "step": 281, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:28.912127", "step": 281, "epoch": 1 }, { "type": "loss", "content": 0.017169682309031487, "timestamp": "2025-09-30 22:09:28.915984", "step": 282, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:28.956136", "step": 282, "epoch": 1 }, { "type": "loss", "content": 0.022637654095888138, "timestamp": "2025-09-30 22:09:28.960484", "step": 283, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:28.994509", "step": 283, "epoch": 1 }, { "type": "loss", "content": 0.03536595404148102, "timestamp": "2025-09-30 22:09:29.026264", "step": 284, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:29.071959", "step": 284, "epoch": 1 }, { "type": "loss", "content": 0.024568825960159302, "timestamp": "2025-09-30 22:09:29.074404", "step": 285, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:09:30.063079", "step": 285, "epoch": 1 }, { "type": "pplx", "content": 67541395.63657059, "timestamp": "2025-09-30 22:09:30.068836", "step": 285, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:30.098905", "step": 285, "epoch": 1 }, { "type": "loss", "content": 0.014133117161691189, "timestamp": "2025-09-30 22:09:30.101945", "step": 286, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:09:30.133810", "step": 286, "epoch": 1 }, { "type": "loss", "content": 0.019359812140464783, "timestamp": "2025-09-30 22:09:30.136460", "step": 287, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:30.173794", "step": 287, "epoch": 1 }, { "type": "loss", "content": 0.02667580358684063, "timestamp": "2025-09-30 22:09:30.198910", "step": 288, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:30.231598", "step": 288, "epoch": 1 }, { "type": "loss", "content": 0.026296624913811684, "timestamp": "2025-09-30 22:09:30.236977", "step": 289, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:30.274855", "step": 289, "epoch": 1 }, { "type": "loss", "content": 0.016411451622843742, "timestamp": "2025-09-30 22:09:30.278524", "step": 290, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:30.314050", "step": 290, "epoch": 1 }, { "type": "loss", "content": 0.020339542999863625, "timestamp": "2025-09-30 22:09:30.319097", "step": 291, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:30.352661", "step": 291, "epoch": 1 }, { "type": "loss", "content": 0.02564086951315403, "timestamp": "2025-09-30 22:09:30.381112", "step": 292, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:30.422462", "step": 292, "epoch": 1 }, { "type": "loss", "content": 0.015429101884365082, "timestamp": "2025-09-30 22:09:30.425828", "step": 293, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:30.461677", "step": 293, "epoch": 1 }, { "type": "loss", "content": 0.029271453619003296, "timestamp": "2025-09-30 22:09:30.469204", "step": 294, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:30.505070", "step": 294, "epoch": 1 }, { "type": "loss", "content": 0.020868074148893356, "timestamp": "2025-09-30 22:09:30.507786", "step": 295, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:30.542341", "step": 295, "epoch": 1 }, { "type": "loss", "content": 0.014817570336163044, "timestamp": "2025-09-30 22:09:30.569318", "step": 296, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:30.613647", "step": 296, "epoch": 1 }, { "type": "loss", "content": 0.01824035681784153, "timestamp": "2025-09-30 22:09:30.616991", "step": 297, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:30.649677", "step": 297, "epoch": 1 }, { "type": "loss", "content": 0.02374071255326271, "timestamp": "2025-09-30 22:09:30.652636", "step": 298, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:30.698100", "step": 298, "epoch": 1 }, { "type": "loss", "content": 0.01639840006828308, "timestamp": "2025-09-30 22:09:30.701630", "step": 299, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:30.748179", "step": 299, "epoch": 1 }, { "type": "loss", "content": 0.018694445490837097, "timestamp": "2025-09-30 22:09:30.773482", "step": 300, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:30.818274", "step": 300, "epoch": 1 }, { "type": "loss", "content": 0.024248849600553513, "timestamp": "2025-09-30 22:09:30.824282", "step": 301, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:30.865970", "step": 301, "epoch": 1 }, { "type": "loss", "content": 0.01262893807142973, "timestamp": "2025-09-30 22:09:30.868640", "step": 302, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:30.903772", "step": 302, "epoch": 1 }, { "type": "loss", "content": 0.011411392129957676, "timestamp": "2025-09-30 22:09:30.910795", "step": 303, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:30.957868", "step": 303, "epoch": 1 }, { "type": "loss", "content": 0.015114856883883476, "timestamp": "2025-09-30 22:09:30.983434", "step": 304, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:31.020505", "step": 304, "epoch": 1 }, { "type": "loss", "content": 0.020919784903526306, "timestamp": "2025-09-30 22:09:31.023106", "step": 305, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:31.058977", "step": 305, "epoch": 1 }, { "type": "loss", "content": 0.030423318967223167, "timestamp": "2025-09-30 22:09:31.062077", "step": 306, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:31.094352", "step": 306, "epoch": 1 }, { "type": "loss", "content": 0.029984524473547935, "timestamp": "2025-09-30 22:09:31.097811", "step": 307, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:31.129591", "step": 307, "epoch": 1 }, { "type": "loss", "content": 0.018007751554250717, "timestamp": "2025-09-30 22:09:31.158882", "step": 308, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:31.191676", "step": 308, "epoch": 1 }, { "type": "loss", "content": 0.005845424719154835, "timestamp": "2025-09-30 22:09:31.194782", "step": 309, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:31.229860", "step": 309, "epoch": 1 }, { "type": "loss", "content": 0.033217113465070724, "timestamp": "2025-09-30 22:09:31.238443", "step": 310, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:31.272512", "step": 310, "epoch": 1 }, { "type": "loss", "content": 0.03235980495810509, "timestamp": "2025-09-30 22:09:31.275483", "step": 311, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:31.309284", "step": 311, "epoch": 1 }, { "type": "loss", "content": 0.019056664779782295, "timestamp": "2025-09-30 22:09:31.333963", "step": 312, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:31.378766", "step": 312, "epoch": 1 }, { "type": "loss", "content": 0.02388598583638668, "timestamp": "2025-09-30 22:09:31.393057", "step": 313, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:31.431437", "step": 313, "epoch": 1 }, { "type": "loss", "content": 0.015021202154457569, "timestamp": "2025-09-30 22:09:31.436657", "step": 314, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:31.472069", "step": 314, "epoch": 1 }, { "type": "loss", "content": 0.03160003945231438, "timestamp": "2025-09-30 22:09:31.477990", "step": 315, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:31.513901", "step": 315, "epoch": 1 }, { "type": "loss", "content": 0.006040097679942846, "timestamp": "2025-09-30 22:09:31.540087", "step": 316, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:31.576512", "step": 316, "epoch": 1 }, { "type": "loss", "content": 0.010645993985235691, "timestamp": "2025-09-30 22:09:31.581099", "step": 317, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:31.617568", "step": 317, "epoch": 1 }, { "type": "loss", "content": 0.028969671577215195, "timestamp": "2025-09-30 22:09:31.621272", "step": 318, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:31.656645", "step": 318, "epoch": 1 }, { "type": "loss", "content": 0.015102268196642399, "timestamp": "2025-09-30 22:09:31.662613", "step": 319, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:31.698770", "step": 319, "epoch": 1 }, { "type": "loss", "content": 0.020058704540133476, "timestamp": "2025-09-30 22:09:31.723844", "step": 320, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:31.769420", "step": 320, "epoch": 1 }, { "type": "loss", "content": 0.01846270076930523, "timestamp": "2025-09-30 22:09:31.774034", "step": 321, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:31.822909", "step": 321, "epoch": 1 }, { "type": "loss", "content": 0.035393718630075455, "timestamp": "2025-09-30 22:09:31.828845", "step": 322, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:31.867154", "step": 322, "epoch": 1 }, { "type": "loss", "content": 0.0352255143225193, "timestamp": "2025-09-30 22:09:31.877932", "step": 323, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:31.925281", "step": 323, "epoch": 1 }, { "type": "loss", "content": 0.025907455012202263, "timestamp": "2025-09-30 22:09:31.950942", "step": 324, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:31.995930", "step": 324, "epoch": 1 }, { "type": "loss", "content": 0.02590997889637947, "timestamp": "2025-09-30 22:09:32.000413", "step": 325, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:32.047117", "step": 325, "epoch": 1 }, { "type": "loss", "content": 0.02406112290918827, "timestamp": "2025-09-30 22:09:32.050366", "step": 326, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:32.086862", "step": 326, "epoch": 1 }, { "type": "loss", "content": 0.026598971337080002, "timestamp": "2025-09-30 22:09:32.091614", "step": 327, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:09:32.127983", "step": 327, "epoch": 1 }, { "type": "loss", "content": 0.024215033277869225, "timestamp": "2025-09-30 22:09:32.152703", "step": 328, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:32.205176", "step": 328, "epoch": 1 }, { "type": "loss", "content": 0.01977970078587532, "timestamp": "2025-09-30 22:09:32.216556", "step": 329, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:32.281426", "step": 329, "epoch": 1 }, { "type": "loss", "content": 0.02125716581940651, "timestamp": "2025-09-30 22:09:32.289404", "step": 330, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:32.333052", "step": 330, "epoch": 1 }, { "type": "loss", "content": 0.020354611799120903, "timestamp": "2025-09-30 22:09:32.337434", "step": 331, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:32.378729", "step": 331, "epoch": 1 }, { "type": "loss", "content": 0.033674128353595734, "timestamp": "2025-09-30 22:09:32.410082", "step": 332, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:32.450701", "step": 332, "epoch": 1 }, { "type": "loss", "content": 0.030016472563147545, "timestamp": "2025-09-30 22:09:32.462638", "step": 333, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:32.510816", "step": 333, "epoch": 1 }, { "type": "loss", "content": 0.025528879836201668, "timestamp": "2025-09-30 22:09:32.514079", "step": 334, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:32.562892", "step": 334, "epoch": 1 }, { "type": "loss", "content": 0.022933050990104675, "timestamp": "2025-09-30 22:09:32.574319", "step": 335, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:09:32.610613", "step": 335, "epoch": 1 }, { "type": "loss", "content": 0.02693336084485054, "timestamp": "2025-09-30 22:09:32.636625", "step": 336, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:32.677899", "step": 336, "epoch": 1 }, { "type": "loss", "content": 0.0301218144595623, "timestamp": "2025-09-30 22:09:32.681008", "step": 337, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:32.722275", "step": 337, "epoch": 1 }, { "type": "loss", "content": 0.02498083934187889, "timestamp": "2025-09-30 22:09:32.729951", "step": 338, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:32.771925", "step": 338, "epoch": 1 }, { "type": "loss", "content": 0.020787309855222702, "timestamp": "2025-09-30 22:09:32.775976", "step": 339, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:32.812922", "step": 339, "epoch": 1 }, { "type": "loss", "content": 0.020002322271466255, "timestamp": "2025-09-30 22:09:32.839529", "step": 340, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:32.903630", "step": 340, "epoch": 1 }, { "type": "loss", "content": 0.023302387446165085, "timestamp": "2025-09-30 22:09:32.907785", "step": 341, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:32.945673", "step": 341, "epoch": 1 }, { "type": "loss", "content": 0.030038101598620415, "timestamp": "2025-09-30 22:09:32.953338", "step": 342, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:09:34.031111", "step": 342, "epoch": 1 }, { "type": "pplx", "content": 69919293.60206336, "timestamp": "2025-09-30 22:09:34.042131", "step": 342, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:34.080251", "step": 342, "epoch": 1 }, { "type": "loss", "content": 0.02141713909804821, "timestamp": "2025-09-30 22:09:34.083537", "step": 343, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:34.128037", "step": 343, "epoch": 1 }, { "type": "loss", "content": 0.04290292412042618, "timestamp": "2025-09-30 22:09:34.153547", "step": 344, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:34.187199", "step": 344, "epoch": 1 }, { "type": "loss", "content": 0.023067476227879524, "timestamp": "2025-09-30 22:09:34.191669", "step": 345, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:34.226177", "step": 345, "epoch": 1 }, { "type": "loss", "content": 0.030794357880949974, "timestamp": "2025-09-30 22:09:34.229473", "step": 346, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:34.279983", "step": 346, "epoch": 1 }, { "type": "loss", "content": 0.021632444113492966, "timestamp": "2025-09-30 22:09:34.284325", "step": 347, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:34.334928", "step": 347, "epoch": 1 }, { "type": "loss", "content": 0.024647142738103867, "timestamp": "2025-09-30 22:09:34.361104", "step": 348, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:34.398676", "step": 348, "epoch": 1 }, { "type": "loss", "content": 0.01926472969353199, "timestamp": "2025-09-30 22:09:34.401633", "step": 349, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:34.434687", "step": 349, "epoch": 1 }, { "type": "loss", "content": 0.029012974351644516, "timestamp": "2025-09-30 22:09:34.438084", "step": 350, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:34.475765", "step": 350, "epoch": 1 }, { "type": "loss", "content": 0.0349954217672348, "timestamp": "2025-09-30 22:09:34.478355", "step": 351, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:34.513407", "step": 351, "epoch": 1 }, { "type": "loss", "content": 0.024386536329984665, "timestamp": "2025-09-30 22:09:34.543507", "step": 352, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:34.583125", "step": 352, "epoch": 1 }, { "type": "loss", "content": 0.037951208651065826, "timestamp": "2025-09-30 22:09:34.588211", "step": 353, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:34.637639", "step": 353, "epoch": 1 }, { "type": "loss", "content": 0.028125781565904617, "timestamp": "2025-09-30 22:09:34.641137", "step": 354, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:34.683930", "step": 354, "epoch": 1 }, { "type": "loss", "content": 0.029399218037724495, "timestamp": "2025-09-30 22:09:34.687425", "step": 355, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:34.728408", "step": 355, "epoch": 1 }, { "type": "loss", "content": 0.012775695882737637, "timestamp": "2025-09-30 22:09:34.753795", "step": 356, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:34.804617", "step": 356, "epoch": 1 }, { "type": "loss", "content": 0.017647113651037216, "timestamp": "2025-09-30 22:09:34.808689", "step": 357, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:34.853194", "step": 357, "epoch": 1 }, { "type": "loss", "content": 0.01233199518173933, "timestamp": "2025-09-30 22:09:34.864538", "step": 358, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:34.907648", "step": 358, "epoch": 1 }, { "type": "loss", "content": 0.009669276885688305, "timestamp": "2025-09-30 22:09:34.911690", "step": 359, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:34.946291", "step": 359, "epoch": 1 }, { "type": "loss", "content": 0.019029730930924416, "timestamp": "2025-09-30 22:09:34.970626", "step": 360, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:35.006672", "step": 360, "epoch": 1 }, { "type": "loss", "content": 0.02731066755950451, "timestamp": "2025-09-30 22:09:35.009767", "step": 361, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:35.050270", "step": 361, "epoch": 1 }, { "type": "loss", "content": 0.032158952206373215, "timestamp": "2025-09-30 22:09:35.054516", "step": 362, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:35.102949", "step": 362, "epoch": 1 }, { "type": "loss", "content": 0.022303972393274307, "timestamp": "2025-09-30 22:09:35.108215", "step": 363, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:35.148425", "step": 363, "epoch": 1 }, { "type": "loss", "content": 0.010140285827219486, "timestamp": "2025-09-30 22:09:35.173555", "step": 364, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:35.212952", "step": 364, "epoch": 1 }, { "type": "loss", "content": 0.02385084703564644, "timestamp": "2025-09-30 22:09:35.225443", "step": 365, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:35.271733", "step": 365, "epoch": 1 }, { "type": "loss", "content": 0.013454384170472622, "timestamp": "2025-09-30 22:09:35.277010", "step": 366, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:09:35.312917", "step": 366, "epoch": 1 }, { "type": "loss", "content": 0.017787154763936996, "timestamp": "2025-09-30 22:09:35.323639", "step": 367, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:35.360561", "step": 367, "epoch": 1 }, { "type": "loss", "content": 0.02700085937976837, "timestamp": "2025-09-30 22:09:35.385136", "step": 368, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:35.421380", "step": 368, "epoch": 1 }, { "type": "loss", "content": 0.02195524238049984, "timestamp": "2025-09-30 22:09:35.430490", "step": 369, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:35.468443", "step": 369, "epoch": 1 }, { "type": "loss", "content": 0.0264048483222723, "timestamp": "2025-09-30 22:09:35.472904", "step": 370, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:35.511783", "step": 370, "epoch": 1 }, { "type": "loss", "content": 0.012875868938863277, "timestamp": "2025-09-30 22:09:35.516881", "step": 371, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:35.556326", "step": 371, "epoch": 1 }, { "type": "loss", "content": 0.03059910610318184, "timestamp": "2025-09-30 22:09:35.581083", "step": 372, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:35.623872", "step": 372, "epoch": 1 }, { "type": "loss", "content": 0.026967348530888557, "timestamp": "2025-09-30 22:09:35.627985", "step": 373, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:35.664828", "step": 373, "epoch": 1 }, { "type": "loss", "content": 0.025812586769461632, "timestamp": "2025-09-30 22:09:35.677022", "step": 374, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:35.715645", "step": 374, "epoch": 1 }, { "type": "loss", "content": 0.017953310161828995, "timestamp": "2025-09-30 22:09:35.718931", "step": 375, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:35.779443", "step": 375, "epoch": 1 }, { "type": "loss", "content": 0.01472870446741581, "timestamp": "2025-09-30 22:09:35.804726", "step": 376, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:35.839732", "step": 376, "epoch": 1 }, { "type": "loss", "content": 0.015016913414001465, "timestamp": "2025-09-30 22:09:35.844058", "step": 377, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:35.888207", "step": 377, "epoch": 1 }, { "type": "loss", "content": 0.02110576257109642, "timestamp": "2025-09-30 22:09:35.892136", "step": 378, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:35.943435", "step": 378, "epoch": 1 }, { "type": "loss", "content": 0.020046228542923927, "timestamp": "2025-09-30 22:09:35.948441", "step": 379, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:35.991476", "step": 379, "epoch": 1 }, { "type": "loss", "content": 0.020357858389616013, "timestamp": "2025-09-30 22:09:36.017437", "step": 380, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:36.052483", "step": 380, "epoch": 1 }, { "type": "loss", "content": 0.02067236602306366, "timestamp": "2025-09-30 22:09:36.056689", "step": 381, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:36.096194", "step": 381, "epoch": 1 }, { "type": "loss", "content": 0.021537670865654945, "timestamp": "2025-09-30 22:09:36.100337", "step": 382, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:36.141962", "step": 382, "epoch": 1 }, { "type": "loss", "content": 0.017676519230008125, "timestamp": "2025-09-30 22:09:36.145376", "step": 383, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:36.179449", "step": 383, "epoch": 1 }, { "type": "loss", "content": 0.02317376248538494, "timestamp": "2025-09-30 22:09:36.204712", "step": 384, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:36.245389", "step": 384, "epoch": 1 }, { "type": "loss", "content": 0.016081545501947403, "timestamp": "2025-09-30 22:09:36.261000", "step": 385, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:36.296757", "step": 385, "epoch": 1 }, { "type": "loss", "content": 0.01612349972128868, "timestamp": "2025-09-30 22:09:36.309147", "step": 386, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:36.352038", "step": 386, "epoch": 1 }, { "type": "loss", "content": 0.01686694473028183, "timestamp": "2025-09-30 22:09:36.359009", "step": 387, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:36.398587", "step": 387, "epoch": 1 }, { "type": "loss", "content": 0.023269539698958397, "timestamp": "2025-09-30 22:09:36.423721", "step": 388, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:36.462294", "step": 388, "epoch": 1 }, { "type": "loss", "content": 0.02148287370800972, "timestamp": "2025-09-30 22:09:36.466876", "step": 389, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:36.507808", "step": 389, "epoch": 1 }, { "type": "loss", "content": 0.02776309847831726, "timestamp": "2025-09-30 22:09:36.511686", "step": 390, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:36.548916", "step": 390, "epoch": 1 }, { "type": "loss", "content": 0.029952269047498703, "timestamp": "2025-09-30 22:09:36.553771", "step": 391, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:36.588283", "step": 391, "epoch": 1 }, { "type": "loss", "content": 0.015080233104526997, "timestamp": "2025-09-30 22:09:36.613242", "step": 392, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:36.647960", "step": 392, "epoch": 1 }, { "type": "loss", "content": 0.01509413868188858, "timestamp": "2025-09-30 22:09:36.651767", "step": 393, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:09:36.685129", "step": 393, "epoch": 1 }, { "type": "loss", "content": 0.02030949667096138, "timestamp": "2025-09-30 22:09:36.688359", "step": 394, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:36.737919", "step": 394, "epoch": 1 }, { "type": "loss", "content": 0.010384783148765564, "timestamp": "2025-09-30 22:09:36.745003", "step": 395, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:36.782177", "step": 395, "epoch": 1 }, { "type": "loss", "content": 0.01926104538142681, "timestamp": "2025-09-30 22:09:36.806865", "step": 396, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:36.846794", "step": 396, "epoch": 1 }, { "type": "loss", "content": 0.007770257536321878, "timestamp": "2025-09-30 22:09:36.852637", "step": 397, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:36.903386", "step": 397, "epoch": 1 }, { "type": "loss", "content": 0.012745345011353493, "timestamp": "2025-09-30 22:09:36.907521", "step": 398, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:36.945406", "step": 398, "epoch": 1 }, { "type": "loss", "content": 0.020830152556300163, "timestamp": "2025-09-30 22:09:36.952080", "step": 399, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:09:37.985435", "step": 399, "epoch": 1 }, { "type": "pplx", "content": 69766047.56409389, "timestamp": "2025-09-30 22:09:37.989273", "step": 399, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:38.023315", "step": 399, "epoch": 1 }, { "type": "loss", "content": 0.015908202156424522, "timestamp": "2025-09-30 22:09:38.049697", "step": 400, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:38.094255", "step": 400, "epoch": 1 }, { "type": "loss", "content": 0.00617224583402276, "timestamp": "2025-09-30 22:09:38.097537", "step": 401, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:38.134714", "step": 401, "epoch": 1 }, { "type": "loss", "content": 0.02191239409148693, "timestamp": "2025-09-30 22:09:38.138873", "step": 402, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:38.174380", "step": 402, "epoch": 1 }, { "type": "loss", "content": 0.020212730392813683, "timestamp": "2025-09-30 22:09:38.180307", "step": 403, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:38.218317", "step": 403, "epoch": 1 }, { "type": "loss", "content": 0.035332221537828445, "timestamp": "2025-09-30 22:09:38.250151", "step": 404, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:38.291319", "step": 404, "epoch": 1 }, { "type": "loss", "content": 0.028063569217920303, "timestamp": "2025-09-30 22:09:38.303187", "step": 405, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:38.339867", "step": 405, "epoch": 1 }, { "type": "loss", "content": 0.037404220551252365, "timestamp": "2025-09-30 22:09:38.344974", "step": 406, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:38.384853", "step": 406, "epoch": 1 }, { "type": "loss", "content": 0.009203077293932438, "timestamp": "2025-09-30 22:09:38.390086", "step": 407, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:38.429683", "step": 407, "epoch": 1 }, { "type": "loss", "content": 0.006914178375154734, "timestamp": "2025-09-30 22:09:38.455627", "step": 408, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:38.492632", "step": 408, "epoch": 1 }, { "type": "loss", "content": 0.01954054646193981, "timestamp": "2025-09-30 22:09:38.495612", "step": 409, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:38.529636", "step": 409, "epoch": 1 }, { "type": "loss", "content": 0.021575123071670532, "timestamp": "2025-09-30 22:09:38.532507", "step": 410, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:38.574228", "step": 410, "epoch": 1 }, { "type": "loss", "content": 0.01723899319767952, "timestamp": "2025-09-30 22:09:38.577214", "step": 411, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:38.620060", "step": 411, "epoch": 1 }, { "type": "loss", "content": 0.038678593933582306, "timestamp": "2025-09-30 22:09:38.650713", "step": 412, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:38.683786", "step": 412, "epoch": 1 }, { "type": "loss", "content": 0.013338501565158367, "timestamp": "2025-09-30 22:09:38.687134", "step": 413, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:38.724089", "step": 413, "epoch": 1 }, { "type": "loss", "content": 0.03637400269508362, "timestamp": "2025-09-30 22:09:38.736940", "step": 414, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:38.771853", "step": 414, "epoch": 1 }, { "type": "loss", "content": 0.04411586374044418, "timestamp": "2025-09-30 22:09:38.786289", "step": 415, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:38.819623", "step": 415, "epoch": 1 }, { "type": "loss", "content": 0.00751527538523078, "timestamp": "2025-09-30 22:09:38.844258", "step": 416, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:38.882058", "step": 416, "epoch": 1 }, { "type": "loss", "content": 0.016765791922807693, "timestamp": "2025-09-30 22:09:38.894642", "step": 417, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:38.932004", "step": 417, "epoch": 1 }, { "type": "loss", "content": 0.028791921213269234, "timestamp": "2025-09-30 22:09:38.936323", "step": 418, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:38.976353", "step": 418, "epoch": 1 }, { "type": "loss", "content": 0.022568225860595703, "timestamp": "2025-09-30 22:09:38.979880", "step": 419, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:39.013998", "step": 419, "epoch": 1 }, { "type": "loss", "content": 0.014296877197921276, "timestamp": "2025-09-30 22:09:39.044562", "step": 420, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:39.081846", "step": 420, "epoch": 1 }, { "type": "loss", "content": 0.012356564402580261, "timestamp": "2025-09-30 22:09:39.086430", "step": 421, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:39.130132", "step": 421, "epoch": 1 }, { "type": "loss", "content": 0.010573080740869045, "timestamp": "2025-09-30 22:09:39.135029", "step": 422, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:39.176056", "step": 422, "epoch": 1 }, { "type": "loss", "content": 0.013825618661940098, "timestamp": "2025-09-30 22:09:39.179347", "step": 423, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:39.212189", "step": 423, "epoch": 1 }, { "type": "loss", "content": 0.015153718180954456, "timestamp": "2025-09-30 22:09:39.236550", "step": 424, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:39.270159", "step": 424, "epoch": 1 }, { "type": "loss", "content": 0.019143089652061462, "timestamp": "2025-09-30 22:09:39.274183", "step": 425, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:39.306859", "step": 425, "epoch": 1 }, { "type": "loss", "content": 0.025414396077394485, "timestamp": "2025-09-30 22:09:39.309563", "step": 426, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:39.344109", "step": 426, "epoch": 1 }, { "type": "loss", "content": 0.018626336008310318, "timestamp": "2025-09-30 22:09:39.346665", "step": 427, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:39.386968", "step": 427, "epoch": 1 }, { "type": "loss", "content": 0.013591468334197998, "timestamp": "2025-09-30 22:09:39.411056", "step": 428, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:39.443796", "step": 428, "epoch": 1 }, { "type": "loss", "content": 0.02437545731663704, "timestamp": "2025-09-30 22:09:39.450852", "step": 429, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:39.485737", "step": 429, "epoch": 1 }, { "type": "loss", "content": 0.020426731556653976, "timestamp": "2025-09-30 22:09:39.489024", "step": 430, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:39.528782", "step": 430, "epoch": 1 }, { "type": "loss", "content": 0.01356650609523058, "timestamp": "2025-09-30 22:09:39.536063", "step": 431, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:39.580730", "step": 431, "epoch": 1 }, { "type": "loss", "content": 0.019357116892933846, "timestamp": "2025-09-30 22:09:39.606081", "step": 432, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:39.647934", "step": 432, "epoch": 1 }, { "type": "loss", "content": 0.014607379212975502, "timestamp": "2025-09-30 22:09:39.657392", "step": 433, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:09:39.705398", "step": 433, "epoch": 1 }, { "type": "loss", "content": 0.02458411268889904, "timestamp": "2025-09-30 22:09:39.716036", "step": 434, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:39.753100", "step": 434, "epoch": 1 }, { "type": "loss", "content": 0.01863814704120159, "timestamp": "2025-09-30 22:09:39.761228", "step": 435, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:39.794958", "step": 435, "epoch": 1 }, { "type": "loss", "content": 0.015778204426169395, "timestamp": "2025-09-30 22:09:39.820506", "step": 436, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:39.858954", "step": 436, "epoch": 1 }, { "type": "loss", "content": 0.026689749211072922, "timestamp": "2025-09-30 22:09:39.861771", "step": 437, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:39.903709", "step": 437, "epoch": 1 }, { "type": "loss", "content": 0.028620868921279907, "timestamp": "2025-09-30 22:09:39.912094", "step": 438, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:39.950297", "step": 438, "epoch": 1 }, { "type": "loss", "content": 0.038862477988004684, "timestamp": "2025-09-30 22:09:39.953880", "step": 439, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:39.991357", "step": 439, "epoch": 1 }, { "type": "loss", "content": 0.01892365887761116, "timestamp": "2025-09-30 22:09:40.016499", "step": 440, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:40.058678", "step": 440, "epoch": 1 }, { "type": "loss", "content": 0.02564559504389763, "timestamp": "2025-09-30 22:09:40.068642", "step": 441, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:40.109996", "step": 441, "epoch": 1 }, { "type": "loss", "content": 0.02982628531754017, "timestamp": "2025-09-30 22:09:40.114870", "step": 442, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:40.155375", "step": 442, "epoch": 1 }, { "type": "loss", "content": 0.012708432041108608, "timestamp": "2025-09-30 22:09:40.167290", "step": 443, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:40.227142", "step": 443, "epoch": 1 }, { "type": "loss", "content": 0.03156870976090431, "timestamp": "2025-09-30 22:09:40.262332", "step": 444, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:40.307457", "step": 444, "epoch": 1 }, { "type": "loss", "content": 0.012731562368571758, "timestamp": "2025-09-30 22:09:40.320228", "step": 445, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:40.373129", "step": 445, "epoch": 1 }, { "type": "loss", "content": 0.019928231835365295, "timestamp": "2025-09-30 22:09:40.377514", "step": 446, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:40.424312", "step": 446, "epoch": 1 }, { "type": "loss", "content": 0.010161194019019604, "timestamp": "2025-09-30 22:09:40.438210", "step": 447, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:40.484577", "step": 447, "epoch": 1 }, { "type": "loss", "content": 0.017352495342493057, "timestamp": "2025-09-30 22:09:40.521010", "step": 448, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:40.564598", "step": 448, "epoch": 1 }, { "type": "loss", "content": 0.009393865242600441, "timestamp": "2025-09-30 22:09:40.569244", "step": 449, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:40.612354", "step": 449, "epoch": 1 }, { "type": "loss", "content": 0.02728438377380371, "timestamp": "2025-09-30 22:09:40.615217", "step": 450, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:40.655775", "step": 450, "epoch": 1 }, { "type": "loss", "content": 0.020611116662621498, "timestamp": "2025-09-30 22:09:40.659271", "step": 451, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:40.693792", "step": 451, "epoch": 1 }, { "type": "loss", "content": 0.009651792235672474, "timestamp": "2025-09-30 22:09:40.723988", "step": 452, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:40.776475", "step": 452, "epoch": 1 }, { "type": "loss", "content": 0.016400078311562538, "timestamp": "2025-09-30 22:09:40.788329", "step": 453, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:09:40.840089", "step": 453, "epoch": 1 }, { "type": "loss", "content": 0.018265044316649437, "timestamp": "2025-09-30 22:09:40.843522", "step": 454, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:40.885816", "step": 454, "epoch": 1 }, { "type": "loss", "content": 0.02419288642704487, "timestamp": "2025-09-30 22:09:40.889386", "step": 455, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:40.927807", "step": 455, "epoch": 1 }, { "type": "loss", "content": 0.013079372234642506, "timestamp": "2025-09-30 22:09:40.960208", "step": 456, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:09:41.993404", "step": 456, "epoch": 1 }, { "type": "pplx", "content": 71624224.58457558, "timestamp": "2025-09-30 22:09:41.996470", "step": 456, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:42.026109", "step": 456, "epoch": 1 }, { "type": "loss", "content": 0.022408118471503258, "timestamp": "2025-09-30 22:09:42.028771", "step": 457, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:42.065959", "step": 457, "epoch": 1 }, { "type": "loss", "content": 0.03150170296430588, "timestamp": "2025-09-30 22:09:42.068419", "step": 458, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:42.103334", "step": 458, "epoch": 1 }, { "type": "loss", "content": 0.010739601217210293, "timestamp": "2025-09-30 22:09:42.108529", "step": 459, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:42.140776", "step": 459, "epoch": 1 }, { "type": "loss", "content": 0.03207072988152504, "timestamp": "2025-09-30 22:09:42.165701", "step": 460, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:42.197356", "step": 460, "epoch": 1 }, { "type": "loss", "content": 0.017702434211969376, "timestamp": "2025-09-30 22:09:42.200545", "step": 461, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:42.233174", "step": 461, "epoch": 1 }, { "type": "loss", "content": 0.009795568883419037, "timestamp": "2025-09-30 22:09:42.237262", "step": 462, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:42.276405", "step": 462, "epoch": 1 }, { "type": "loss", "content": 0.008488679304718971, "timestamp": "2025-09-30 22:09:42.285896", "step": 463, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:42.318785", "step": 463, "epoch": 1 }, { "type": "loss", "content": 0.021367086097598076, "timestamp": "2025-09-30 22:09:42.343436", "step": 464, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:42.376115", "step": 464, "epoch": 1 }, { "type": "loss", "content": 0.01849384978413582, "timestamp": "2025-09-30 22:09:42.379287", "step": 465, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:42.419389", "step": 465, "epoch": 1 }, { "type": "loss", "content": 0.015463622286915779, "timestamp": "2025-09-30 22:09:42.425901", "step": 466, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:42.463041", "step": 466, "epoch": 1 }, { "type": "loss", "content": 0.018109837546944618, "timestamp": "2025-09-30 22:09:42.466051", "step": 467, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:42.498020", "step": 467, "epoch": 1 }, { "type": "loss", "content": 0.027336323633790016, "timestamp": "2025-09-30 22:09:42.524118", "step": 468, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:42.556592", "step": 468, "epoch": 1 }, { "type": "loss", "content": 0.023516599088907242, "timestamp": "2025-09-30 22:09:42.559441", "step": 469, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:42.602347", "step": 469, "epoch": 1 }, { "type": "loss", "content": 0.03789094090461731, "timestamp": "2025-09-30 22:09:42.606093", "step": 470, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:42.644859", "step": 470, "epoch": 1 }, { "type": "loss", "content": 0.006722516845911741, "timestamp": "2025-09-30 22:09:42.647902", "step": 471, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:42.681530", "step": 471, "epoch": 1 }, { "type": "loss", "content": 0.018469030037522316, "timestamp": "2025-09-30 22:09:42.707026", "step": 472, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:42.746588", "step": 472, "epoch": 1 }, { "type": "loss", "content": 0.019120389595627785, "timestamp": "2025-09-30 22:09:42.749953", "step": 473, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:42.784143", "step": 473, "epoch": 1 }, { "type": "loss", "content": 0.013059214688837528, "timestamp": "2025-09-30 22:09:42.788133", "step": 474, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:42.822558", "step": 474, "epoch": 1 }, { "type": "loss", "content": 0.011780438013374805, "timestamp": "2025-09-30 22:09:42.825250", "step": 475, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:42.869895", "step": 475, "epoch": 1 }, { "type": "loss", "content": 0.01069071888923645, "timestamp": "2025-09-30 22:09:42.894335", "step": 476, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:42.935117", "step": 476, "epoch": 1 }, { "type": "loss", "content": 0.012630014680325985, "timestamp": "2025-09-30 22:09:42.941017", "step": 477, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:42.973933", "step": 477, "epoch": 1 }, { "type": "loss", "content": 0.014345736242830753, "timestamp": "2025-09-30 22:09:42.977658", "step": 478, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:43.016170", "step": 478, "epoch": 1 }, { "type": "loss", "content": 0.013579734601080418, "timestamp": "2025-09-30 22:09:43.022549", "step": 479, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:43.056113", "step": 479, "epoch": 1 }, { "type": "loss", "content": 0.005495802033692598, "timestamp": "2025-09-30 22:09:43.080369", "step": 480, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:43.112060", "step": 480, "epoch": 1 }, { "type": "loss", "content": 0.024428632110357285, "timestamp": "2025-09-30 22:09:43.115005", "step": 481, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:43.157534", "step": 481, "epoch": 1 }, { "type": "loss", "content": 0.027178974822163582, "timestamp": "2025-09-30 22:09:43.160113", "step": 482, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:43.193019", "step": 482, "epoch": 1 }, { "type": "loss", "content": 0.011072909459471703, "timestamp": "2025-09-30 22:09:43.199496", "step": 483, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:43.230455", "step": 483, "epoch": 1 }, { "type": "loss", "content": 0.015952181071043015, "timestamp": "2025-09-30 22:09:43.255671", "step": 484, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:43.299126", "step": 484, "epoch": 1 }, { "type": "loss", "content": 0.01662355288863182, "timestamp": "2025-09-30 22:09:43.301958", "step": 485, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:43.341828", "step": 485, "epoch": 1 }, { "type": "loss", "content": 0.013828770257532597, "timestamp": "2025-09-30 22:09:43.344889", "step": 486, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:43.380170", "step": 486, "epoch": 1 }, { "type": "loss", "content": 0.024804027751088142, "timestamp": "2025-09-30 22:09:43.383012", "step": 487, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:43.417512", "step": 487, "epoch": 1 }, { "type": "loss", "content": 0.024769103154540062, "timestamp": "2025-09-30 22:09:43.442612", "step": 488, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:43.476936", "step": 488, "epoch": 1 }, { "type": "loss", "content": 0.014705442823469639, "timestamp": "2025-09-30 22:09:43.485386", "step": 489, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:43.520192", "step": 489, "epoch": 1 }, { "type": "loss", "content": 0.012051516212522984, "timestamp": "2025-09-30 22:09:43.524132", "step": 490, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:43.559375", "step": 490, "epoch": 1 }, { "type": "loss", "content": 0.022848201915621758, "timestamp": "2025-09-30 22:09:43.562399", "step": 491, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:43.596133", "step": 491, "epoch": 1 }, { "type": "loss", "content": 0.008396407589316368, "timestamp": "2025-09-30 22:09:43.620436", "step": 492, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:43.654644", "step": 492, "epoch": 1 }, { "type": "loss", "content": 0.006951562594622374, "timestamp": "2025-09-30 22:09:43.658010", "step": 493, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:43.697030", "step": 493, "epoch": 1 }, { "type": "loss", "content": 0.03694300726056099, "timestamp": "2025-09-30 22:09:43.700477", "step": 494, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:43.742657", "step": 494, "epoch": 1 }, { "type": "loss", "content": 0.011894827708601952, "timestamp": "2025-09-30 22:09:43.750525", "step": 495, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:43.788152", "step": 495, "epoch": 1 }, { "type": "loss", "content": 0.018705202266573906, "timestamp": "2025-09-30 22:09:43.822061", "step": 496, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:43.862987", "step": 496, "epoch": 1 }, { "type": "loss", "content": 0.008354590274393559, "timestamp": "2025-09-30 22:09:43.867623", "step": 497, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:43.909660", "step": 497, "epoch": 1 }, { "type": "loss", "content": 0.012969491071999073, "timestamp": "2025-09-30 22:09:43.912524", "step": 498, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:43.947799", "step": 498, "epoch": 1 }, { "type": "loss", "content": 0.021853569895029068, "timestamp": "2025-09-30 22:09:43.950860", "step": 499, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:43.987077", "step": 499, "epoch": 1 }, { "type": "loss", "content": 0.0062960060313344, "timestamp": "2025-09-30 22:09:44.011190", "step": 500, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 500", "timestamp": "2025-09-30 22:09:50.738730", "step": 500, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:50.780347", "step": 500, "epoch": 1 }, { "type": "loss", "content": 0.005837480071932077, "timestamp": "2025-09-30 22:09:50.784736", "step": 501, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:50.818683", "step": 501, "epoch": 1 }, { "type": "loss", "content": 0.018935924395918846, "timestamp": "2025-09-30 22:09:50.822618", "step": 502, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:50.858601", "step": 502, "epoch": 1 }, { "type": "loss", "content": 0.014775346033275127, "timestamp": "2025-09-30 22:09:50.862412", "step": 503, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:50.897092", "step": 503, "epoch": 1 }, { "type": "loss", "content": 0.015353151597082615, "timestamp": "2025-09-30 22:09:50.921935", "step": 504, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:50.970503", "step": 504, "epoch": 1 }, { "type": "loss", "content": 0.0022692049387842417, "timestamp": "2025-09-30 22:09:50.973217", "step": 505, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:51.006814", "step": 505, "epoch": 1 }, { "type": "loss", "content": 0.060447730123996735, "timestamp": "2025-09-30 22:09:51.010466", "step": 506, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:51.045170", "step": 506, "epoch": 1 }, { "type": "loss", "content": 0.030348746106028557, "timestamp": "2025-09-30 22:09:51.048114", "step": 507, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:51.092241", "step": 507, "epoch": 1 }, { "type": "loss", "content": 0.037154439836740494, "timestamp": "2025-09-30 22:09:51.118561", "step": 508, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:51.161099", "step": 508, "epoch": 1 }, { "type": "loss", "content": 0.0052850088104605675, "timestamp": "2025-09-30 22:09:51.177958", "step": 509, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:51.231146", "step": 509, "epoch": 1 }, { "type": "loss", "content": 0.00752244982868433, "timestamp": "2025-09-30 22:09:51.235644", "step": 510, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:51.297841", "step": 510, "epoch": 1 }, { "type": "loss", "content": 0.036073219031095505, "timestamp": "2025-09-30 22:09:51.302005", "step": 511, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:51.346641", "step": 511, "epoch": 1 }, { "type": "loss", "content": 0.003323480486869812, "timestamp": "2025-09-30 22:09:51.371866", "step": 512, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:51.415960", "step": 512, "epoch": 1 }, { "type": "loss", "content": 0.0270464438945055, "timestamp": "2025-09-30 22:09:51.430159", "step": 513, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:09:52.498120", "step": 513, "epoch": 1 }, { "type": "pplx", "content": 69181075.1968092, "timestamp": "2025-09-30 22:09:52.501210", "step": 513, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:52.531996", "step": 513, "epoch": 1 }, { "type": "loss", "content": 0.003395517822355032, "timestamp": "2025-09-30 22:09:52.542774", "step": 514, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:52.578982", "step": 514, "epoch": 1 }, { "type": "loss", "content": 0.025082603096961975, "timestamp": "2025-09-30 22:09:52.583259", "step": 515, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:52.620589", "step": 515, "epoch": 1 }, { "type": "loss", "content": 0.018759986385703087, "timestamp": "2025-09-30 22:09:52.653747", "step": 516, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:52.693029", "step": 516, "epoch": 1 }, { "type": "loss", "content": 0.032633136957883835, "timestamp": "2025-09-30 22:09:52.696843", "step": 517, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:52.732376", "step": 517, "epoch": 1 }, { "type": "loss", "content": 0.02764914371073246, "timestamp": "2025-09-30 22:09:52.736492", "step": 518, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:52.771273", "step": 518, "epoch": 1 }, { "type": "loss", "content": 0.01991136558353901, "timestamp": "2025-09-30 22:09:52.776288", "step": 519, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:52.810695", "step": 519, "epoch": 1 }, { "type": "loss", "content": 0.00627879286184907, "timestamp": "2025-09-30 22:09:52.834895", "step": 520, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:52.876942", "step": 520, "epoch": 1 }, { "type": "loss", "content": 0.022504499182105064, "timestamp": "2025-09-30 22:09:52.879858", "step": 521, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:52.931814", "step": 521, "epoch": 1 }, { "type": "loss", "content": 0.020745843648910522, "timestamp": "2025-09-30 22:09:52.935751", "step": 522, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:52.977983", "step": 522, "epoch": 1 }, { "type": "loss", "content": 0.010214862413704395, "timestamp": "2025-09-30 22:09:52.981858", "step": 523, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:53.018030", "step": 523, "epoch": 1 }, { "type": "loss", "content": 0.00609058141708374, "timestamp": "2025-09-30 22:09:53.043069", "step": 524, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:53.082381", "step": 524, "epoch": 1 }, { "type": "loss", "content": 0.021598542109131813, "timestamp": "2025-09-30 22:09:53.085908", "step": 525, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:53.133503", "step": 525, "epoch": 1 }, { "type": "loss", "content": 0.03276056423783302, "timestamp": "2025-09-30 22:09:53.136848", "step": 526, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:53.178781", "step": 526, "epoch": 1 }, { "type": "loss", "content": 0.015305069275200367, "timestamp": "2025-09-30 22:09:53.182574", "step": 527, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:53.214935", "step": 527, "epoch": 1 }, { "type": "loss", "content": 0.013995721936225891, "timestamp": "2025-09-30 22:09:53.239870", "step": 528, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:53.273616", "step": 528, "epoch": 1 }, { "type": "loss", "content": 0.008622610941529274, "timestamp": "2025-09-30 22:09:53.277669", "step": 529, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:53.318583", "step": 529, "epoch": 1 }, { "type": "loss", "content": 0.007376163732260466, "timestamp": "2025-09-30 22:09:53.329815", "step": 530, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:53.370223", "step": 530, "epoch": 1 }, { "type": "loss", "content": 0.02018904872238636, "timestamp": "2025-09-30 22:09:53.374649", "step": 531, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:53.413421", "step": 531, "epoch": 1 }, { "type": "loss", "content": 0.015136617235839367, "timestamp": "2025-09-30 22:09:53.437519", "step": 532, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:53.472547", "step": 532, "epoch": 1 }, { "type": "loss", "content": 0.02124469354748726, "timestamp": "2025-09-30 22:09:53.476684", "step": 533, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:53.514396", "step": 533, "epoch": 1 }, { "type": "loss", "content": 0.03296453133225441, "timestamp": "2025-09-30 22:09:53.518042", "step": 534, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:53.560333", "step": 534, "epoch": 1 }, { "type": "loss", "content": 0.018277723342180252, "timestamp": "2025-09-30 22:09:53.564731", "step": 535, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:53.602170", "step": 535, "epoch": 1 }, { "type": "loss", "content": 0.006135314702987671, "timestamp": "2025-09-30 22:09:53.634980", "step": 536, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:53.688603", "step": 536, "epoch": 1 }, { "type": "loss", "content": 0.032480981200933456, "timestamp": "2025-09-30 22:09:53.691849", "step": 537, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:53.725993", "step": 537, "epoch": 1 }, { "type": "loss", "content": 0.013781941495835781, "timestamp": "2025-09-30 22:09:53.729468", "step": 538, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:53.763671", "step": 538, "epoch": 1 }, { "type": "loss", "content": 0.060867954045534134, "timestamp": "2025-09-30 22:09:53.768282", "step": 539, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:53.812288", "step": 539, "epoch": 1 }, { "type": "loss", "content": 0.019219841808080673, "timestamp": "2025-09-30 22:09:53.837721", "step": 540, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:53.883827", "step": 540, "epoch": 1 }, { "type": "loss", "content": 0.04094391316175461, "timestamp": "2025-09-30 22:09:53.898395", "step": 541, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:53.947781", "step": 541, "epoch": 1 }, { "type": "loss", "content": 0.01698930561542511, "timestamp": "2025-09-30 22:09:53.954204", "step": 542, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:54.018268", "step": 542, "epoch": 1 }, { "type": "loss", "content": 0.005156234372407198, "timestamp": "2025-09-30 22:09:54.022377", "step": 543, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:54.083498", "step": 543, "epoch": 1 }, { "type": "loss", "content": 0.025457771494984627, "timestamp": "2025-09-30 22:09:54.119462", "step": 544, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:54.164984", "step": 544, "epoch": 1 }, { "type": "loss", "content": 0.026326140388846397, "timestamp": "2025-09-30 22:09:54.169457", "step": 545, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:54.204557", "step": 545, "epoch": 1 }, { "type": "loss", "content": 0.03105790540575981, "timestamp": "2025-09-30 22:09:54.208175", "step": 546, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:54.249537", "step": 546, "epoch": 1 }, { "type": "loss", "content": 0.03859322890639305, "timestamp": "2025-09-30 22:09:54.263448", "step": 547, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:54.306427", "step": 547, "epoch": 1 }, { "type": "loss", "content": 0.02075488492846489, "timestamp": "2025-09-30 22:09:54.331639", "step": 548, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:54.372706", "step": 548, "epoch": 1 }, { "type": "loss", "content": 0.0369565524160862, "timestamp": "2025-09-30 22:09:54.384731", "step": 549, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:54.439782", "step": 549, "epoch": 1 }, { "type": "loss", "content": 0.026859017089009285, "timestamp": "2025-09-30 22:09:54.443146", "step": 550, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:54.477483", "step": 550, "epoch": 1 }, { "type": "loss", "content": 0.027645906433463097, "timestamp": "2025-09-30 22:09:54.489100", "step": 551, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:54.532929", "step": 551, "epoch": 1 }, { "type": "loss", "content": 0.035172879695892334, "timestamp": "2025-09-30 22:09:54.557550", "step": 552, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:54.591364", "step": 552, "epoch": 1 }, { "type": "loss", "content": 0.03344567492604256, "timestamp": "2025-09-30 22:09:54.594414", "step": 553, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:54.635780", "step": 553, "epoch": 1 }, { "type": "loss", "content": 0.025778179988265038, "timestamp": "2025-09-30 22:09:54.638541", "step": 554, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:54.672505", "step": 554, "epoch": 1 }, { "type": "loss", "content": 0.00893770344555378, "timestamp": "2025-09-30 22:09:54.675913", "step": 555, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:54.720654", "step": 555, "epoch": 1 }, { "type": "loss", "content": 0.02957429364323616, "timestamp": "2025-09-30 22:09:54.744864", "step": 556, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:54.781349", "step": 556, "epoch": 1 }, { "type": "loss", "content": 0.028988074511289597, "timestamp": "2025-09-30 22:09:54.787979", "step": 557, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:54.830351", "step": 557, "epoch": 1 }, { "type": "loss", "content": 0.017317553982138634, "timestamp": "2025-09-30 22:09:54.833830", "step": 558, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:54.866807", "step": 558, "epoch": 1 }, { "type": "loss", "content": 0.029713379219174385, "timestamp": "2025-09-30 22:09:54.874652", "step": 559, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:54.916014", "step": 559, "epoch": 1 }, { "type": "loss", "content": 0.010365837253630161, "timestamp": "2025-09-30 22:09:54.940306", "step": 560, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:54.980306", "step": 560, "epoch": 1 }, { "type": "loss", "content": 0.021086541935801506, "timestamp": "2025-09-30 22:09:54.983397", "step": 561, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:55.026773", "step": 561, "epoch": 1 }, { "type": "loss", "content": 0.01718318462371826, "timestamp": "2025-09-30 22:09:55.034896", "step": 562, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:55.069910", "step": 562, "epoch": 1 }, { "type": "loss", "content": 0.01298757828772068, "timestamp": "2025-09-30 22:09:55.073087", "step": 563, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:55.118024", "step": 563, "epoch": 1 }, { "type": "loss", "content": 0.0263203177601099, "timestamp": "2025-09-30 22:09:55.149975", "step": 564, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:55.182194", "step": 564, "epoch": 1 }, { "type": "loss", "content": 0.024742284789681435, "timestamp": "2025-09-30 22:09:55.185155", "step": 565, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:55.220844", "step": 565, "epoch": 1 }, { "type": "loss", "content": 0.021866092458367348, "timestamp": "2025-09-30 22:09:55.228652", "step": 566, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:55.270924", "step": 566, "epoch": 1 }, { "type": "loss", "content": 0.013656304217875004, "timestamp": "2025-09-30 22:09:55.273579", "step": 567, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:09:55.306996", "step": 567, "epoch": 1 }, { "type": "loss", "content": 0.021317383274435997, "timestamp": "2025-09-30 22:09:55.335191", "step": 568, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:55.375128", "step": 568, "epoch": 1 }, { "type": "loss", "content": 0.02845599129796028, "timestamp": "2025-09-30 22:09:55.380309", "step": 569, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:55.416485", "step": 569, "epoch": 1 }, { "type": "loss", "content": 0.014579109847545624, "timestamp": "2025-09-30 22:09:55.419575", "step": 570, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:09:56.359081", "step": 570, "epoch": 1 }, { "type": "pplx", "content": 77694531.26741207, "timestamp": "2025-09-30 22:09:56.362304", "step": 570, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:56.397136", "step": 570, "epoch": 1 }, { "type": "loss", "content": 0.014638873748481274, "timestamp": "2025-09-30 22:09:56.405918", "step": 571, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:56.446234", "step": 571, "epoch": 1 }, { "type": "loss", "content": 0.016072118654847145, "timestamp": "2025-09-30 22:09:56.470593", "step": 572, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:56.503615", "step": 572, "epoch": 1 }, { "type": "loss", "content": 0.026216475293040276, "timestamp": "2025-09-30 22:09:56.507736", "step": 573, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:56.541132", "step": 573, "epoch": 1 }, { "type": "loss", "content": 0.016313405707478523, "timestamp": "2025-09-30 22:09:56.543511", "step": 574, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:56.575571", "step": 574, "epoch": 1 }, { "type": "loss", "content": 0.017599981278181076, "timestamp": "2025-09-30 22:09:56.577989", "step": 575, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:56.610692", "step": 575, "epoch": 1 }, { "type": "loss", "content": 0.012232857756316662, "timestamp": "2025-09-30 22:09:56.641116", "step": 576, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:56.673337", "step": 576, "epoch": 1 }, { "type": "loss", "content": 0.021130342036485672, "timestamp": "2025-09-30 22:09:56.679142", "step": 577, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:09:56.710451", "step": 577, "epoch": 1 }, { "type": "loss", "content": 0.02429911494255066, "timestamp": "2025-09-30 22:09:56.713187", "step": 578, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:56.746106", "step": 578, "epoch": 1 }, { "type": "loss", "content": 0.01091067399829626, "timestamp": "2025-09-30 22:09:56.750525", "step": 579, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:56.781104", "step": 579, "epoch": 1 }, { "type": "loss", "content": 0.00966098252683878, "timestamp": "2025-09-30 22:09:56.805207", "step": 580, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:56.836940", "step": 580, "epoch": 1 }, { "type": "loss", "content": 0.015667086467146873, "timestamp": "2025-09-30 22:09:56.841298", "step": 581, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:56.883250", "step": 581, "epoch": 1 }, { "type": "loss", "content": 0.007937231101095676, "timestamp": "2025-09-30 22:09:56.885852", "step": 582, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:56.917925", "step": 582, "epoch": 1 }, { "type": "loss", "content": 0.022754153236746788, "timestamp": "2025-09-30 22:09:56.922117", "step": 583, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:56.962312", "step": 583, "epoch": 1 }, { "type": "loss", "content": 0.013802661560475826, "timestamp": "2025-09-30 22:09:56.986630", "step": 584, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:57.019297", "step": 584, "epoch": 1 }, { "type": "loss", "content": 0.01588551141321659, "timestamp": "2025-09-30 22:09:57.022169", "step": 585, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:57.053100", "step": 585, "epoch": 1 }, { "type": "loss", "content": 0.007105494383722544, "timestamp": "2025-09-30 22:09:57.061645", "step": 586, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:57.096839", "step": 586, "epoch": 1 }, { "type": "loss", "content": 0.00457183551043272, "timestamp": "2025-09-30 22:09:57.099156", "step": 587, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:57.132381", "step": 587, "epoch": 1 }, { "type": "loss", "content": 0.036326173692941666, "timestamp": "2025-09-30 22:09:57.159837", "step": 588, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:57.201453", "step": 588, "epoch": 1 }, { "type": "loss", "content": 0.031208250671625137, "timestamp": "2025-09-30 22:09:57.209571", "step": 589, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:57.253806", "step": 589, "epoch": 1 }, { "type": "loss", "content": 0.026373323053121567, "timestamp": "2025-09-30 22:09:57.261750", "step": 590, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:09:57.311706", "step": 590, "epoch": 1 }, { "type": "loss", "content": 0.029552871361374855, "timestamp": "2025-09-30 22:09:57.314904", "step": 591, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:57.348917", "step": 591, "epoch": 1 }, { "type": "loss", "content": 0.015717869624495506, "timestamp": "2025-09-30 22:09:57.372907", "step": 592, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:57.413967", "step": 592, "epoch": 1 }, { "type": "loss", "content": 0.0234203040599823, "timestamp": "2025-09-30 22:09:57.416505", "step": 593, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:57.449886", "step": 593, "epoch": 1 }, { "type": "loss", "content": 0.006487022154033184, "timestamp": "2025-09-30 22:09:57.452631", "step": 594, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:57.488007", "step": 594, "epoch": 1 }, { "type": "loss", "content": 0.018183846026659012, "timestamp": "2025-09-30 22:09:57.497039", "step": 595, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:57.540000", "step": 595, "epoch": 1 }, { "type": "loss", "content": 0.03163054212927818, "timestamp": "2025-09-30 22:09:57.564703", "step": 596, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:57.602692", "step": 596, "epoch": 1 }, { "type": "loss", "content": 0.01731395348906517, "timestamp": "2025-09-30 22:09:57.605511", "step": 597, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:57.642712", "step": 597, "epoch": 1 }, { "type": "loss", "content": 0.016153180971741676, "timestamp": "2025-09-30 22:09:57.645756", "step": 598, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:57.684776", "step": 598, "epoch": 1 }, { "type": "loss", "content": 0.03578946739435196, "timestamp": "2025-09-30 22:09:57.687752", "step": 599, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:57.719187", "step": 599, "epoch": 1 }, { "type": "loss", "content": 0.018138926476240158, "timestamp": "2025-09-30 22:09:57.755394", "step": 600, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:57.786383", "step": 600, "epoch": 1 }, { "type": "loss", "content": 0.0256302822381258, "timestamp": "2025-09-30 22:09:57.788419", "step": 601, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:57.819218", "step": 601, "epoch": 1 }, { "type": "loss", "content": 0.019077766686677933, "timestamp": "2025-09-30 22:09:57.821834", "step": 602, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:57.859076", "step": 602, "epoch": 1 }, { "type": "loss", "content": 0.015015407465398312, "timestamp": "2025-09-30 22:09:57.861984", "step": 603, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:57.894027", "step": 603, "epoch": 1 }, { "type": "loss", "content": 0.022305699065327644, "timestamp": "2025-09-30 22:09:57.919835", "step": 604, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:57.953265", "step": 604, "epoch": 1 }, { "type": "loss", "content": 0.017405861988663673, "timestamp": "2025-09-30 22:09:57.956514", "step": 605, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:57.989323", "step": 605, "epoch": 1 }, { "type": "loss", "content": 0.02004510723054409, "timestamp": "2025-09-30 22:09:57.991412", "step": 606, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:58.022784", "step": 606, "epoch": 1 }, { "type": "loss", "content": 0.022168070077896118, "timestamp": "2025-09-30 22:09:58.025373", "step": 607, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:09:58.056916", "step": 607, "epoch": 1 }, { "type": "loss", "content": 0.015638450160622597, "timestamp": "2025-09-30 22:09:58.081625", "step": 608, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:58.113926", "step": 608, "epoch": 1 }, { "type": "loss", "content": 0.017927611246705055, "timestamp": "2025-09-30 22:09:58.130234", "step": 609, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:58.171678", "step": 609, "epoch": 1 }, { "type": "loss", "content": 0.01894911751151085, "timestamp": "2025-09-30 22:09:58.174991", "step": 610, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:58.215274", "step": 610, "epoch": 1 }, { "type": "loss", "content": 0.02456963248550892, "timestamp": "2025-09-30 22:09:58.217971", "step": 611, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:58.250551", "step": 611, "epoch": 1 }, { "type": "loss", "content": 0.03773067519068718, "timestamp": "2025-09-30 22:09:58.275222", "step": 612, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:58.310140", "step": 612, "epoch": 1 }, { "type": "loss", "content": 0.030875859782099724, "timestamp": "2025-09-30 22:09:58.312482", "step": 613, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:58.345460", "step": 613, "epoch": 1 }, { "type": "loss", "content": 0.012332675978541374, "timestamp": "2025-09-30 22:09:58.351572", "step": 614, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:58.384197", "step": 614, "epoch": 1 }, { "type": "loss", "content": 0.03169454261660576, "timestamp": "2025-09-30 22:09:58.387041", "step": 615, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:58.419955", "step": 615, "epoch": 1 }, { "type": "loss", "content": 0.025642435997724533, "timestamp": "2025-09-30 22:09:58.444713", "step": 616, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:58.480313", "step": 616, "epoch": 1 }, { "type": "loss", "content": 0.03705728054046631, "timestamp": "2025-09-30 22:09:58.483990", "step": 617, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:58.515874", "step": 617, "epoch": 1 }, { "type": "loss", "content": 0.01200761180371046, "timestamp": "2025-09-30 22:09:58.519461", "step": 618, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:58.550113", "step": 618, "epoch": 1 }, { "type": "loss", "content": 0.025091679766774178, "timestamp": "2025-09-30 22:09:58.552635", "step": 619, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:58.583437", "step": 619, "epoch": 1 }, { "type": "loss", "content": 0.014052960090339184, "timestamp": "2025-09-30 22:09:58.606813", "step": 620, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:58.644661", "step": 620, "epoch": 1 }, { "type": "loss", "content": 0.013598896563053131, "timestamp": "2025-09-30 22:09:58.646757", "step": 621, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:58.678094", "step": 621, "epoch": 1 }, { "type": "loss", "content": 0.015057659707963467, "timestamp": "2025-09-30 22:09:58.680288", "step": 622, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:58.711648", "step": 622, "epoch": 1 }, { "type": "loss", "content": 0.013657798990607262, "timestamp": "2025-09-30 22:09:58.714492", "step": 623, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:58.747139", "step": 623, "epoch": 1 }, { "type": "loss", "content": 0.04932836815714836, "timestamp": "2025-09-30 22:09:58.774265", "step": 624, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:58.814607", "step": 624, "epoch": 1 }, { "type": "loss", "content": 0.021991120651364326, "timestamp": "2025-09-30 22:09:58.817022", "step": 625, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:58.852026", "step": 625, "epoch": 1 }, { "type": "loss", "content": 0.01033029519021511, "timestamp": "2025-09-30 22:09:58.865897", "step": 626, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:58.899075", "step": 626, "epoch": 1 }, { "type": "loss", "content": 0.004971671849489212, "timestamp": "2025-09-30 22:09:58.901055", "step": 627, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:09:59.686406", "step": 627, "epoch": 1 }, { "type": "pplx", "content": 75762044.56936751, "timestamp": "2025-09-30 22:09:59.689148", "step": 627, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:59.717607", "step": 627, "epoch": 1 }, { "type": "loss", "content": 0.005967786069959402, "timestamp": "2025-09-30 22:09:59.742173", "step": 628, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:59.773138", "step": 628, "epoch": 1 }, { "type": "loss", "content": 0.04627048969268799, "timestamp": "2025-09-30 22:09:59.775314", "step": 629, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:59.805781", "step": 629, "epoch": 1 }, { "type": "loss", "content": 0.03623666614294052, "timestamp": "2025-09-30 22:09:59.807782", "step": 630, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:59.837806", "step": 630, "epoch": 1 }, { "type": "loss", "content": 0.016331830993294716, "timestamp": "2025-09-30 22:09:59.840250", "step": 631, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:59.872032", "step": 631, "epoch": 1 }, { "type": "loss", "content": 0.015092739835381508, "timestamp": "2025-09-30 22:09:59.895811", "step": 632, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:59.925652", "step": 632, "epoch": 1 }, { "type": "loss", "content": 0.004001949448138475, "timestamp": "2025-09-30 22:09:59.927516", "step": 633, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:09:59.958001", "step": 633, "epoch": 1 }, { "type": "loss", "content": 0.008666311390697956, "timestamp": "2025-09-30 22:09:59.960060", "step": 634, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:09:59.990041", "step": 634, "epoch": 1 }, { "type": "loss", "content": 0.030506502836942673, "timestamp": "2025-09-30 22:09:59.992155", "step": 635, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:00.022023", "step": 635, "epoch": 1 }, { "type": "loss", "content": 0.006849688943475485, "timestamp": "2025-09-30 22:10:00.045475", "step": 636, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:00.076045", "step": 636, "epoch": 1 }, { "type": "loss", "content": 0.002905628876760602, "timestamp": "2025-09-30 22:10:00.078323", "step": 637, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:00.112261", "step": 637, "epoch": 1 }, { "type": "loss", "content": 0.0353107824921608, "timestamp": "2025-09-30 22:10:00.114919", "step": 638, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:00.145532", "step": 638, "epoch": 1 }, { "type": "loss", "content": 0.03190189599990845, "timestamp": "2025-09-30 22:10:00.147962", "step": 639, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:00.181810", "step": 639, "epoch": 1 }, { "type": "loss", "content": 0.002591110300272703, "timestamp": "2025-09-30 22:10:00.209648", "step": 640, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:00.241939", "step": 640, "epoch": 1 }, { "type": "loss", "content": 0.010863580740988255, "timestamp": "2025-09-30 22:10:00.245521", "step": 641, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:00.285889", "step": 641, "epoch": 1 }, { "type": "loss", "content": 0.01622161455452442, "timestamp": "2025-09-30 22:10:00.288459", "step": 642, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:00.324166", "step": 642, "epoch": 1 }, { "type": "loss", "content": 0.008066744543612003, "timestamp": "2025-09-30 22:10:00.329827", "step": 643, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:00.362746", "step": 643, "epoch": 1 }, { "type": "loss", "content": 0.03653540462255478, "timestamp": "2025-09-30 22:10:00.392762", "step": 644, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:00.424716", "step": 644, "epoch": 1 }, { "type": "loss", "content": 0.008595902472734451, "timestamp": "2025-09-30 22:10:00.427010", "step": 645, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:00.458102", "step": 645, "epoch": 1 }, { "type": "loss", "content": 0.03257131204009056, "timestamp": "2025-09-30 22:10:00.460636", "step": 646, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:00.492091", "step": 646, "epoch": 1 }, { "type": "loss", "content": 0.007714335806667805, "timestamp": "2025-09-30 22:10:00.494391", "step": 647, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:00.529366", "step": 647, "epoch": 1 }, { "type": "loss", "content": 0.00786947924643755, "timestamp": "2025-09-30 22:10:00.555784", "step": 648, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:00.593070", "step": 648, "epoch": 1 }, { "type": "loss", "content": 0.009578948840498924, "timestamp": "2025-09-30 22:10:00.596750", "step": 649, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:00.631079", "step": 649, "epoch": 1 }, { "type": "loss", "content": 0.007144716568291187, "timestamp": "2025-09-30 22:10:00.635260", "step": 650, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:00.668063", "step": 650, "epoch": 1 }, { "type": "loss", "content": 0.01738644763827324, "timestamp": "2025-09-30 22:10:00.670564", "step": 651, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:00.700774", "step": 651, "epoch": 1 }, { "type": "loss", "content": 0.013808409683406353, "timestamp": "2025-09-30 22:10:00.725001", "step": 652, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:00.755689", "step": 652, "epoch": 1 }, { "type": "loss", "content": 0.02000904455780983, "timestamp": "2025-09-30 22:10:00.762451", "step": 653, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:00.799705", "step": 653, "epoch": 1 }, { "type": "loss", "content": 0.023145198822021484, "timestamp": "2025-09-30 22:10:00.813065", "step": 654, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:00.844954", "step": 654, "epoch": 1 }, { "type": "loss", "content": 0.006938356440514326, "timestamp": "2025-09-30 22:10:00.847578", "step": 655, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:00.879227", "step": 655, "epoch": 1 }, { "type": "loss", "content": 0.006100603379309177, "timestamp": "2025-09-30 22:10:00.902911", "step": 656, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:00.935058", "step": 656, "epoch": 1 }, { "type": "loss", "content": 0.004185267724096775, "timestamp": "2025-09-30 22:10:00.937749", "step": 657, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:00.971195", "step": 657, "epoch": 1 }, { "type": "loss", "content": 0.01654222048819065, "timestamp": "2025-09-30 22:10:00.974159", "step": 658, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:01.007289", "step": 658, "epoch": 1 }, { "type": "loss", "content": 0.0077790203504264355, "timestamp": "2025-09-30 22:10:01.009528", "step": 659, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:01.053381", "step": 659, "epoch": 1 }, { "type": "loss", "content": 0.011989668942987919, "timestamp": "2025-09-30 22:10:01.077026", "step": 660, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:01.107400", "step": 660, "epoch": 1 }, { "type": "loss", "content": 0.010614539496600628, "timestamp": "2025-09-30 22:10:01.112680", "step": 661, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:01.159568", "step": 661, "epoch": 1 }, { "type": "loss", "content": 0.0053671300411224365, "timestamp": "2025-09-30 22:10:01.168495", "step": 662, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:01.200395", "step": 662, "epoch": 1 }, { "type": "loss", "content": 0.016355205327272415, "timestamp": "2025-09-30 22:10:01.203041", "step": 663, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:01.237777", "step": 663, "epoch": 1 }, { "type": "loss", "content": 0.019035687670111656, "timestamp": "2025-09-30 22:10:01.263115", "step": 664, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:01.298938", "step": 664, "epoch": 1 }, { "type": "loss", "content": 0.009006036445498466, "timestamp": "2025-09-30 22:10:01.318806", "step": 665, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:01.349258", "step": 665, "epoch": 1 }, { "type": "loss", "content": 0.00685217697173357, "timestamp": "2025-09-30 22:10:01.351550", "step": 666, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:01.384878", "step": 666, "epoch": 1 }, { "type": "loss", "content": 0.01326062437146902, "timestamp": "2025-09-30 22:10:01.387328", "step": 667, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:01.419025", "step": 667, "epoch": 1 }, { "type": "loss", "content": 0.013570294715464115, "timestamp": "2025-09-30 22:10:01.443509", "step": 668, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:01.475504", "step": 668, "epoch": 1 }, { "type": "loss", "content": 0.039509546011686325, "timestamp": "2025-09-30 22:10:01.478771", "step": 669, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:01.525788", "step": 669, "epoch": 1 }, { "type": "loss", "content": 0.04997308924794197, "timestamp": "2025-09-30 22:10:01.528750", "step": 670, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:01.560112", "step": 670, "epoch": 1 }, { "type": "loss", "content": 0.005994971841573715, "timestamp": "2025-09-30 22:10:01.563800", "step": 671, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:01.595169", "step": 671, "epoch": 1 }, { "type": "loss", "content": 0.018137672916054726, "timestamp": "2025-09-30 22:10:01.622396", "step": 672, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:01.654248", "step": 672, "epoch": 1 }, { "type": "loss", "content": 0.02233460359275341, "timestamp": "2025-09-30 22:10:01.658163", "step": 673, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:01.689437", "step": 673, "epoch": 1 }, { "type": "loss", "content": 0.026135217398405075, "timestamp": "2025-09-30 22:10:01.692105", "step": 674, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:01.729466", "step": 674, "epoch": 1 }, { "type": "loss", "content": 0.04665851593017578, "timestamp": "2025-09-30 22:10:01.734628", "step": 675, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:01.766347", "step": 675, "epoch": 1 }, { "type": "loss", "content": 0.012036020867526531, "timestamp": "2025-09-30 22:10:01.790289", "step": 676, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:01.826971", "step": 676, "epoch": 1 }, { "type": "loss", "content": 0.01776084490120411, "timestamp": "2025-09-30 22:10:01.829109", "step": 677, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:01.858951", "step": 677, "epoch": 1 }, { "type": "loss", "content": 0.00692532816901803, "timestamp": "2025-09-30 22:10:01.863459", "step": 678, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:01.904224", "step": 678, "epoch": 1 }, { "type": "loss", "content": 0.008399528451263905, "timestamp": "2025-09-30 22:10:01.906615", "step": 679, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:01.938699", "step": 679, "epoch": 1 }, { "type": "loss", "content": 0.012349965050816536, "timestamp": "2025-09-30 22:10:01.962845", "step": 680, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:01.995586", "step": 680, "epoch": 1 }, { "type": "loss", "content": 0.010101139545440674, "timestamp": "2025-09-30 22:10:01.999394", "step": 681, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:02.029697", "step": 681, "epoch": 1 }, { "type": "loss", "content": 0.012034410610795021, "timestamp": "2025-09-30 22:10:02.031565", "step": 682, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:02.063467", "step": 682, "epoch": 1 }, { "type": "loss", "content": 0.01522676832973957, "timestamp": "2025-09-30 22:10:02.065813", "step": 683, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:02.095464", "step": 683, "epoch": 1 }, { "type": "loss", "content": 0.007156494073569775, "timestamp": "2025-09-30 22:10:02.119029", "step": 684, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:10:03.026146", "step": 684, "epoch": 1 }, { "type": "pplx", "content": 75040225.03656973, "timestamp": "2025-09-30 22:10:03.027940", "step": 684, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:03.057877", "step": 684, "epoch": 1 }, { "type": "loss", "content": 0.027570990845561028, "timestamp": "2025-09-30 22:10:03.059859", "step": 685, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:03.096902", "step": 685, "epoch": 1 }, { "type": "loss", "content": 0.018303800374269485, "timestamp": "2025-09-30 22:10:03.098881", "step": 686, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:03.128674", "step": 686, "epoch": 1 }, { "type": "loss", "content": 0.005918011534959078, "timestamp": "2025-09-30 22:10:03.131031", "step": 687, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:03.162133", "step": 687, "epoch": 1 }, { "type": "loss", "content": 0.00888880342245102, "timestamp": "2025-09-30 22:10:03.185819", "step": 688, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:03.216448", "step": 688, "epoch": 1 }, { "type": "loss", "content": 0.03235236182808876, "timestamp": "2025-09-30 22:10:03.219377", "step": 689, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:03.250220", "step": 689, "epoch": 1 }, { "type": "loss", "content": 0.007580969948321581, "timestamp": "2025-09-30 22:10:03.252494", "step": 690, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:03.283394", "step": 690, "epoch": 1 }, { "type": "loss", "content": 0.011107278056442738, "timestamp": "2025-09-30 22:10:03.286750", "step": 691, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:03.329475", "step": 691, "epoch": 1 }, { "type": "loss", "content": 0.010450707748532295, "timestamp": "2025-09-30 22:10:03.353238", "step": 692, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:03.389345", "step": 692, "epoch": 1 }, { "type": "loss", "content": 0.021500239148736, "timestamp": "2025-09-30 22:10:03.391531", "step": 693, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:03.422398", "step": 693, "epoch": 1 }, { "type": "loss", "content": 0.04603993892669678, "timestamp": "2025-09-30 22:10:03.425868", "step": 694, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:03.457563", "step": 694, "epoch": 1 }, { "type": "loss", "content": 0.005174047313630581, "timestamp": "2025-09-30 22:10:03.466641", "step": 695, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:03.501262", "step": 695, "epoch": 1 }, { "type": "loss", "content": 0.02627837099134922, "timestamp": "2025-09-30 22:10:03.524896", "step": 696, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:03.561635", "step": 696, "epoch": 1 }, { "type": "loss", "content": 0.01084921695291996, "timestamp": "2025-09-30 22:10:03.566186", "step": 697, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:03.600203", "step": 697, "epoch": 1 }, { "type": "loss", "content": 0.011157850734889507, "timestamp": "2025-09-30 22:10:03.605230", "step": 698, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:03.639902", "step": 698, "epoch": 1 }, { "type": "loss", "content": 0.023838777095079422, "timestamp": "2025-09-30 22:10:03.651726", "step": 699, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:03.694541", "step": 699, "epoch": 1 }, { "type": "loss", "content": 0.02645074389874935, "timestamp": "2025-09-30 22:10:03.718493", "step": 700, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:03.760015", "step": 700, "epoch": 1 }, { "type": "loss", "content": 0.006150428671389818, "timestamp": "2025-09-30 22:10:03.762389", "step": 701, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:03.793712", "step": 701, "epoch": 1 }, { "type": "loss", "content": 0.018974557518959045, "timestamp": "2025-09-30 22:10:03.796231", "step": 702, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:03.827927", "step": 702, "epoch": 1 }, { "type": "loss", "content": 0.018012331798672676, "timestamp": "2025-09-30 22:10:03.833070", "step": 703, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:03.872360", "step": 703, "epoch": 1 }, { "type": "loss", "content": 0.014314976520836353, "timestamp": "2025-09-30 22:10:03.897893", "step": 704, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:03.929370", "step": 704, "epoch": 1 }, { "type": "loss", "content": 0.020005246624350548, "timestamp": "2025-09-30 22:10:03.933187", "step": 705, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:03.964732", "step": 705, "epoch": 1 }, { "type": "loss", "content": 0.006193004548549652, "timestamp": "2025-09-30 22:10:03.966911", "step": 706, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:04.000306", "step": 706, "epoch": 1 }, { "type": "loss", "content": 0.006298999767750502, "timestamp": "2025-09-30 22:10:04.002478", "step": 707, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:04.033057", "step": 707, "epoch": 1 }, { "type": "loss", "content": 0.012022623792290688, "timestamp": "2025-09-30 22:10:04.056790", "step": 708, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:04.096985", "step": 708, "epoch": 1 }, { "type": "loss", "content": 0.010289990343153477, "timestamp": "2025-09-30 22:10:04.099617", "step": 709, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:04.130289", "step": 709, "epoch": 1 }, { "type": "loss", "content": 0.04707222059369087, "timestamp": "2025-09-30 22:10:04.133120", "step": 710, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:10:04.163461", "step": 710, "epoch": 1 }, { "type": "loss", "content": 0.02272258698940277, "timestamp": "2025-09-30 22:10:04.166094", "step": 711, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:04.206302", "step": 711, "epoch": 1 }, { "type": "loss", "content": 0.034390226006507874, "timestamp": "2025-09-30 22:10:04.230759", "step": 712, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:04.261763", "step": 712, "epoch": 1 }, { "type": "loss", "content": 0.014662191271781921, "timestamp": "2025-09-30 22:10:04.263917", "step": 713, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:04.294300", "step": 713, "epoch": 1 }, { "type": "loss", "content": 0.011226105503737926, "timestamp": "2025-09-30 22:10:04.296253", "step": 714, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:04.327259", "step": 714, "epoch": 1 }, { "type": "loss", "content": 0.02475506253540516, "timestamp": "2025-09-30 22:10:04.330232", "step": 715, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:04.360791", "step": 715, "epoch": 1 }, { "type": "loss", "content": 0.035920243710279465, "timestamp": "2025-09-30 22:10:04.384648", "step": 716, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:04.415677", "step": 716, "epoch": 1 }, { "type": "loss", "content": 0.01731456071138382, "timestamp": "2025-09-30 22:10:04.417852", "step": 717, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:04.448702", "step": 717, "epoch": 1 }, { "type": "loss", "content": 0.015075340867042542, "timestamp": "2025-09-30 22:10:04.451638", "step": 718, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:04.482199", "step": 718, "epoch": 1 }, { "type": "loss", "content": 0.017238110303878784, "timestamp": "2025-09-30 22:10:04.484997", "step": 719, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:04.516673", "step": 719, "epoch": 1 }, { "type": "loss", "content": 0.028648529201745987, "timestamp": "2025-09-30 22:10:04.541086", "step": 720, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:04.571548", "step": 720, "epoch": 1 }, { "type": "loss", "content": 0.005524549167603254, "timestamp": "2025-09-30 22:10:04.573834", "step": 721, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:04.604370", "step": 721, "epoch": 1 }, { "type": "loss", "content": 0.006129828747361898, "timestamp": "2025-09-30 22:10:04.606408", "step": 722, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:04.643225", "step": 722, "epoch": 1 }, { "type": "loss", "content": 0.010930589400231838, "timestamp": "2025-09-30 22:10:04.645764", "step": 723, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:04.677728", "step": 723, "epoch": 1 }, { "type": "loss", "content": 0.02778017707169056, "timestamp": "2025-09-30 22:10:04.702056", "step": 724, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:04.733280", "step": 724, "epoch": 1 }, { "type": "loss", "content": 0.03585416451096535, "timestamp": "2025-09-30 22:10:04.737077", "step": 725, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:04.771163", "step": 725, "epoch": 1 }, { "type": "loss", "content": 0.021228177472949028, "timestamp": "2025-09-30 22:10:04.773894", "step": 726, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:04.811822", "step": 726, "epoch": 1 }, { "type": "loss", "content": 0.01692606322467327, "timestamp": "2025-09-30 22:10:04.815307", "step": 727, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:04.856289", "step": 727, "epoch": 1 }, { "type": "loss", "content": 0.009032380767166615, "timestamp": "2025-09-30 22:10:04.880861", "step": 728, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:04.913131", "step": 728, "epoch": 1 }, { "type": "loss", "content": 0.0076371668837964535, "timestamp": "2025-09-30 22:10:04.915737", "step": 729, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:04.956413", "step": 729, "epoch": 1 }, { "type": "loss", "content": 0.02644680067896843, "timestamp": "2025-09-30 22:10:04.959566", "step": 730, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:04.990070", "step": 730, "epoch": 1 }, { "type": "loss", "content": 0.010779125615954399, "timestamp": "2025-09-30 22:10:04.993057", "step": 731, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:05.027996", "step": 731, "epoch": 1 }, { "type": "loss", "content": 0.023335661739110947, "timestamp": "2025-09-30 22:10:05.051825", "step": 732, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:05.081923", "step": 732, "epoch": 1 }, { "type": "loss", "content": 0.01767139323055744, "timestamp": "2025-09-30 22:10:05.084638", "step": 733, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:05.115460", "step": 733, "epoch": 1 }, { "type": "loss", "content": 0.011058658361434937, "timestamp": "2025-09-30 22:10:05.118036", "step": 734, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:05.148161", "step": 734, "epoch": 1 }, { "type": "loss", "content": 0.014833243563771248, "timestamp": "2025-09-30 22:10:05.150681", "step": 735, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:05.186507", "step": 735, "epoch": 1 }, { "type": "loss", "content": 0.012499203905463219, "timestamp": "2025-09-30 22:10:05.210091", "step": 736, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:05.241766", "step": 736, "epoch": 1 }, { "type": "loss", "content": 0.027693988755345345, "timestamp": "2025-09-30 22:10:05.243953", "step": 737, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:05.274186", "step": 737, "epoch": 1 }, { "type": "loss", "content": 0.012153781950473785, "timestamp": "2025-09-30 22:10:05.276161", "step": 738, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:05.306651", "step": 738, "epoch": 1 }, { "type": "loss", "content": 0.020603006705641747, "timestamp": "2025-09-30 22:10:05.308770", "step": 739, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:05.339346", "step": 739, "epoch": 1 }, { "type": "loss", "content": 0.0332343615591526, "timestamp": "2025-09-30 22:10:05.363220", "step": 740, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:05.394412", "step": 740, "epoch": 1 }, { "type": "loss", "content": 0.026437604799866676, "timestamp": "2025-09-30 22:10:05.397857", "step": 741, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:10:06.220183", "step": 741, "epoch": 1 }, { "type": "pplx", "content": 75644240.30694193, "timestamp": "2025-09-30 22:10:06.222609", "step": 741, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:06.252650", "step": 741, "epoch": 1 }, { "type": "loss", "content": 0.01217217929661274, "timestamp": "2025-09-30 22:10:06.255053", "step": 742, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:06.287688", "step": 742, "epoch": 1 }, { "type": "loss", "content": 0.014330956153571606, "timestamp": "2025-09-30 22:10:06.289835", "step": 743, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:06.320607", "step": 743, "epoch": 1 }, { "type": "loss", "content": 0.034744616597890854, "timestamp": "2025-09-30 22:10:06.344390", "step": 744, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:06.377075", "step": 744, "epoch": 1 }, { "type": "loss", "content": 0.015021897852420807, "timestamp": "2025-09-30 22:10:06.379763", "step": 745, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:06.410778", "step": 745, "epoch": 1 }, { "type": "loss", "content": 0.011597558856010437, "timestamp": "2025-09-30 22:10:06.413225", "step": 746, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:06.445145", "step": 746, "epoch": 1 }, { "type": "loss", "content": 0.009742917492985725, "timestamp": "2025-09-30 22:10:06.447374", "step": 747, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:06.478670", "step": 747, "epoch": 1 }, { "type": "loss", "content": 0.029180850833654404, "timestamp": "2025-09-30 22:10:06.508907", "step": 748, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:06.539174", "step": 748, "epoch": 1 }, { "type": "loss", "content": 0.01945856213569641, "timestamp": "2025-09-30 22:10:06.541367", "step": 749, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:06.571909", "step": 749, "epoch": 1 }, { "type": "loss", "content": 0.014946605078876019, "timestamp": "2025-09-30 22:10:06.574379", "step": 750, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:06.606384", "step": 750, "epoch": 1 }, { "type": "loss", "content": 0.020322533324360847, "timestamp": "2025-09-30 22:10:06.609089", "step": 751, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:06.639777", "step": 751, "epoch": 1 }, { "type": "loss", "content": 0.03409485146403313, "timestamp": "2025-09-30 22:10:06.663743", "step": 752, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:06.696549", "step": 752, "epoch": 1 }, { "type": "loss", "content": 0.018665427342057228, "timestamp": "2025-09-30 22:10:06.699250", "step": 753, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:06.731448", "step": 753, "epoch": 1 }, { "type": "loss", "content": 0.01316206157207489, "timestamp": "2025-09-30 22:10:06.733644", "step": 754, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:06.764588", "step": 754, "epoch": 1 }, { "type": "loss", "content": 0.017740074545145035, "timestamp": "2025-09-30 22:10:06.768527", "step": 755, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:06.801537", "step": 755, "epoch": 1 }, { "type": "loss", "content": 0.022789278998970985, "timestamp": "2025-09-30 22:10:06.825231", "step": 756, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:06.855432", "step": 756, "epoch": 1 }, { "type": "loss", "content": 0.006472249049693346, "timestamp": "2025-09-30 22:10:06.857852", "step": 757, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:06.889451", "step": 757, "epoch": 1 }, { "type": "loss", "content": 0.0057207318022847176, "timestamp": "2025-09-30 22:10:06.891417", "step": 758, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:06.921905", "step": 758, "epoch": 1 }, { "type": "loss", "content": 0.010666334070265293, "timestamp": "2025-09-30 22:10:06.924240", "step": 759, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:06.954759", "step": 759, "epoch": 1 }, { "type": "loss", "content": 0.012298032641410828, "timestamp": "2025-09-30 22:10:06.979562", "step": 760, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:07.010569", "step": 760, "epoch": 1 }, { "type": "loss", "content": 0.004690289031714201, "timestamp": "2025-09-30 22:10:07.014921", "step": 761, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:10:07.045047", "step": 761, "epoch": 1 }, { "type": "loss", "content": 0.008399398066103458, "timestamp": "2025-09-30 22:10:07.047650", "step": 762, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:07.077913", "step": 762, "epoch": 1 }, { "type": "loss", "content": 0.03833383321762085, "timestamp": "2025-09-30 22:10:07.088868", "step": 763, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:07.125384", "step": 763, "epoch": 1 }, { "type": "loss", "content": 0.011823548004031181, "timestamp": "2025-09-30 22:10:07.149206", "step": 764, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:07.180770", "step": 764, "epoch": 1 }, { "type": "loss", "content": 0.008272853679955006, "timestamp": "2025-09-30 22:10:07.184020", "step": 765, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:07.215207", "step": 765, "epoch": 1 }, { "type": "loss", "content": 0.01553357858210802, "timestamp": "2025-09-30 22:10:07.218279", "step": 766, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:07.252165", "step": 766, "epoch": 1 }, { "type": "loss", "content": 0.008146891370415688, "timestamp": "2025-09-30 22:10:07.255040", "step": 767, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:07.286700", "step": 767, "epoch": 1 }, { "type": "loss", "content": 0.04488224908709526, "timestamp": "2025-09-30 22:10:07.311102", "step": 768, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:07.341855", "step": 768, "epoch": 1 }, { "type": "loss", "content": 0.014228535816073418, "timestamp": "2025-09-30 22:10:07.343860", "step": 769, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:07.373883", "step": 769, "epoch": 1 }, { "type": "loss", "content": 0.03609326854348183, "timestamp": "2025-09-30 22:10:07.375919", "step": 770, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:07.410372", "step": 770, "epoch": 1 }, { "type": "loss", "content": 0.02611478976905346, "timestamp": "2025-09-30 22:10:07.413795", "step": 771, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:07.444880", "step": 771, "epoch": 1 }, { "type": "loss", "content": 0.004257719498127699, "timestamp": "2025-09-30 22:10:07.468544", "step": 772, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:07.499657", "step": 772, "epoch": 1 }, { "type": "loss", "content": 0.05273055285215378, "timestamp": "2025-09-30 22:10:07.501850", "step": 773, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:07.532096", "step": 773, "epoch": 1 }, { "type": "loss", "content": 0.0034468381199985743, "timestamp": "2025-09-30 22:10:07.534575", "step": 774, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:07.567005", "step": 774, "epoch": 1 }, { "type": "loss", "content": 0.0350152887403965, "timestamp": "2025-09-30 22:10:07.569682", "step": 775, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:07.600602", "step": 775, "epoch": 1 }, { "type": "loss", "content": 0.029106054455041885, "timestamp": "2025-09-30 22:10:07.624757", "step": 776, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:07.656333", "step": 776, "epoch": 1 }, { "type": "loss", "content": 0.005838477518409491, "timestamp": "2025-09-30 22:10:07.658869", "step": 777, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:07.690631", "step": 777, "epoch": 1 }, { "type": "loss", "content": 0.0022769535426050425, "timestamp": "2025-09-30 22:10:07.693302", "step": 778, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:07.724407", "step": 778, "epoch": 1 }, { "type": "loss", "content": 0.026938190683722496, "timestamp": "2025-09-30 22:10:07.726716", "step": 779, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:07.758892", "step": 779, "epoch": 1 }, { "type": "loss", "content": 0.0065045664086937904, "timestamp": "2025-09-30 22:10:07.784913", "step": 780, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:07.815369", "step": 780, "epoch": 1 }, { "type": "loss", "content": 0.021315261721611023, "timestamp": "2025-09-30 22:10:07.817962", "step": 781, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:07.856826", "step": 781, "epoch": 1 }, { "type": "loss", "content": 0.012665718793869019, "timestamp": "2025-09-30 22:10:07.859628", "step": 782, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:07.890650", "step": 782, "epoch": 1 }, { "type": "loss", "content": 0.023632079362869263, "timestamp": "2025-09-30 22:10:07.892741", "step": 783, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:07.924441", "step": 783, "epoch": 1 }, { "type": "loss", "content": 0.015980595722794533, "timestamp": "2025-09-30 22:10:07.948390", "step": 784, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:07.978929", "step": 784, "epoch": 1 }, { "type": "loss", "content": 0.005975284148007631, "timestamp": "2025-09-30 22:10:07.980917", "step": 785, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:08.013879", "step": 785, "epoch": 1 }, { "type": "loss", "content": 0.00365698104724288, "timestamp": "2025-09-30 22:10:08.016923", "step": 786, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:08.048097", "step": 786, "epoch": 1 }, { "type": "loss", "content": 0.005156506318598986, "timestamp": "2025-09-30 22:10:08.050351", "step": 787, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:08.081140", "step": 787, "epoch": 1 }, { "type": "loss", "content": 0.02221284620463848, "timestamp": "2025-09-30 22:10:08.104662", "step": 788, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:08.138633", "step": 788, "epoch": 1 }, { "type": "loss", "content": 0.011932899244129658, "timestamp": "2025-09-30 22:10:08.141098", "step": 789, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:08.173020", "step": 789, "epoch": 1 }, { "type": "loss", "content": 0.005140057764947414, "timestamp": "2025-09-30 22:10:08.175327", "step": 790, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:08.207913", "step": 790, "epoch": 1 }, { "type": "loss", "content": 0.015069144777953625, "timestamp": "2025-09-30 22:10:08.210280", "step": 791, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:08.242050", "step": 791, "epoch": 1 }, { "type": "loss", "content": 0.01991919055581093, "timestamp": "2025-09-30 22:10:08.265688", "step": 792, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:08.297469", "step": 792, "epoch": 1 }, { "type": "loss", "content": 0.011542352847754955, "timestamp": "2025-09-30 22:10:08.300828", "step": 793, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:08.334971", "step": 793, "epoch": 1 }, { "type": "loss", "content": 0.03699946776032448, "timestamp": "2025-09-30 22:10:08.343172", "step": 794, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:08.374120", "step": 794, "epoch": 1 }, { "type": "loss", "content": 0.011802605353295803, "timestamp": "2025-09-30 22:10:08.376597", "step": 795, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:08.415508", "step": 795, "epoch": 1 }, { "type": "loss", "content": 0.0063971565105021, "timestamp": "2025-09-30 22:10:08.440533", "step": 796, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:08.472313", "step": 796, "epoch": 1 }, { "type": "loss", "content": 0.010037742555141449, "timestamp": "2025-09-30 22:10:08.475351", "step": 797, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:08.509320", "step": 797, "epoch": 1 }, { "type": "loss", "content": 0.01431556511670351, "timestamp": "2025-09-30 22:10:08.511646", "step": 798, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:10:09.292369", "step": 798, "epoch": 1 }, { "type": "pplx", "content": 76406729.60806397, "timestamp": "2025-09-30 22:10:09.294385", "step": 798, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:09.324329", "step": 798, "epoch": 1 }, { "type": "loss", "content": 0.006856878288090229, "timestamp": "2025-09-30 22:10:09.326823", "step": 799, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:09.360363", "step": 799, "epoch": 1 }, { "type": "loss", "content": 0.037827786058187485, "timestamp": "2025-09-30 22:10:09.384515", "step": 800, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:09.433939", "step": 800, "epoch": 1 }, { "type": "loss", "content": 0.025704285129904747, "timestamp": "2025-09-30 22:10:09.437939", "step": 801, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:09.479720", "step": 801, "epoch": 1 }, { "type": "loss", "content": 0.0315730981528759, "timestamp": "2025-09-30 22:10:09.485052", "step": 802, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:09.535661", "step": 802, "epoch": 1 }, { "type": "loss", "content": 0.026073981076478958, "timestamp": "2025-09-30 22:10:09.540638", "step": 803, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:09.588412", "step": 803, "epoch": 1 }, { "type": "loss", "content": 0.009723112918436527, "timestamp": "2025-09-30 22:10:09.614821", "step": 804, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:09.652465", "step": 804, "epoch": 1 }, { "type": "loss", "content": 0.02134951762855053, "timestamp": "2025-09-30 22:10:09.657368", "step": 805, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:09.712952", "step": 805, "epoch": 1 }, { "type": "loss", "content": 0.02399459108710289, "timestamp": "2025-09-30 22:10:09.716284", "step": 806, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:09.752627", "step": 806, "epoch": 1 }, { "type": "loss", "content": 0.012168281711637974, "timestamp": "2025-09-30 22:10:09.755850", "step": 807, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:09.791768", "step": 807, "epoch": 1 }, { "type": "loss", "content": 0.01669606752693653, "timestamp": "2025-09-30 22:10:09.819829", "step": 808, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:09.857837", "step": 808, "epoch": 1 }, { "type": "loss", "content": 0.008840478025376797, "timestamp": "2025-09-30 22:10:09.863110", "step": 809, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:09.902502", "step": 809, "epoch": 1 }, { "type": "loss", "content": 0.015332608483731747, "timestamp": "2025-09-30 22:10:09.907137", "step": 810, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:09.945104", "step": 810, "epoch": 1 }, { "type": "loss", "content": 0.02074304223060608, "timestamp": "2025-09-30 22:10:09.948780", "step": 811, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:09.987526", "step": 811, "epoch": 1 }, { "type": "loss", "content": 0.012077617458999157, "timestamp": "2025-09-30 22:10:10.013670", "step": 812, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:10.054173", "step": 812, "epoch": 1 }, { "type": "loss", "content": 0.02191738598048687, "timestamp": "2025-09-30 22:10:10.061034", "step": 813, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:10.096190", "step": 813, "epoch": 1 }, { "type": "loss", "content": 0.0168896596878767, "timestamp": "2025-09-30 22:10:10.099881", "step": 814, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:10.139451", "step": 814, "epoch": 1 }, { "type": "loss", "content": 0.00681196479126811, "timestamp": "2025-09-30 22:10:10.142371", "step": 815, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:10.179859", "step": 815, "epoch": 1 }, { "type": "loss", "content": 0.0110024968162179, "timestamp": "2025-09-30 22:10:10.203904", "step": 816, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:10.241164", "step": 816, "epoch": 1 }, { "type": "loss", "content": 0.031019117683172226, "timestamp": "2025-09-30 22:10:10.244077", "step": 817, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:10.284673", "step": 817, "epoch": 1 }, { "type": "loss", "content": 0.014357957057654858, "timestamp": "2025-09-30 22:10:10.288383", "step": 818, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:10.332432", "step": 818, "epoch": 1 }, { "type": "loss", "content": 0.0071715316735208035, "timestamp": "2025-09-30 22:10:10.335084", "step": 819, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:10.374737", "step": 819, "epoch": 1 }, { "type": "loss", "content": 0.01265205442905426, "timestamp": "2025-09-30 22:10:10.399174", "step": 820, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:10.445249", "step": 820, "epoch": 1 }, { "type": "loss", "content": 0.011570468544960022, "timestamp": "2025-09-30 22:10:10.448305", "step": 821, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:10.488167", "step": 821, "epoch": 1 }, { "type": "loss", "content": 0.012007759883999825, "timestamp": "2025-09-30 22:10:10.496619", "step": 822, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:10.529828", "step": 822, "epoch": 1 }, { "type": "loss", "content": 0.01156451366841793, "timestamp": "2025-09-30 22:10:10.533774", "step": 823, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:10.566844", "step": 823, "epoch": 1 }, { "type": "loss", "content": 0.027540789917111397, "timestamp": "2025-09-30 22:10:10.591111", "step": 824, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:10.629161", "step": 824, "epoch": 1 }, { "type": "loss", "content": 0.009911253117024899, "timestamp": "2025-09-30 22:10:10.632734", "step": 825, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:10.668861", "step": 825, "epoch": 1 }, { "type": "loss", "content": 0.016379550099372864, "timestamp": "2025-09-30 22:10:10.671556", "step": 826, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:10.708865", "step": 826, "epoch": 1 }, { "type": "loss", "content": 0.02639281377196312, "timestamp": "2025-09-30 22:10:10.712158", "step": 827, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:10.756565", "step": 827, "epoch": 1 }, { "type": "loss", "content": 0.012039894238114357, "timestamp": "2025-09-30 22:10:10.781013", "step": 828, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:10.812058", "step": 828, "epoch": 1 }, { "type": "loss", "content": 0.014376694336533546, "timestamp": "2025-09-30 22:10:10.814848", "step": 829, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:10.846815", "step": 829, "epoch": 1 }, { "type": "loss", "content": 0.004688502289354801, "timestamp": "2025-09-30 22:10:10.849426", "step": 830, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:10.882726", "step": 830, "epoch": 1 }, { "type": "loss", "content": 0.007329548709094524, "timestamp": "2025-09-30 22:10:10.885788", "step": 831, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:10.917811", "step": 831, "epoch": 1 }, { "type": "loss", "content": 0.006236738059669733, "timestamp": "2025-09-30 22:10:10.950301", "step": 832, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:10.984538", "step": 832, "epoch": 1 }, { "type": "loss", "content": 0.028410309925675392, "timestamp": "2025-09-30 22:10:10.987981", "step": 833, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:11.022303", "step": 833, "epoch": 1 }, { "type": "loss", "content": 0.011889264918863773, "timestamp": "2025-09-30 22:10:11.026246", "step": 834, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:11.062382", "step": 834, "epoch": 1 }, { "type": "loss", "content": 0.0037209205329418182, "timestamp": "2025-09-30 22:10:11.066107", "step": 835, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:11.107789", "step": 835, "epoch": 1 }, { "type": "loss", "content": 0.00844674650579691, "timestamp": "2025-09-30 22:10:11.132646", "step": 836, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:11.191113", "step": 836, "epoch": 1 }, { "type": "loss", "content": 0.03872412443161011, "timestamp": "2025-09-30 22:10:11.195191", "step": 837, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:11.231328", "step": 837, "epoch": 1 }, { "type": "loss", "content": 0.0209845881909132, "timestamp": "2025-09-30 22:10:11.235063", "step": 838, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:11.271720", "step": 838, "epoch": 1 }, { "type": "loss", "content": 0.03015008568763733, "timestamp": "2025-09-30 22:10:11.276000", "step": 839, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:11.317072", "step": 839, "epoch": 1 }, { "type": "loss", "content": 0.012239743955433369, "timestamp": "2025-09-30 22:10:11.342734", "step": 840, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:11.383789", "step": 840, "epoch": 1 }, { "type": "loss", "content": 0.014199174009263515, "timestamp": "2025-09-30 22:10:11.388549", "step": 841, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:11.423823", "step": 841, "epoch": 1 }, { "type": "loss", "content": 0.02206958830356598, "timestamp": "2025-09-30 22:10:11.426955", "step": 842, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:11.467847", "step": 842, "epoch": 1 }, { "type": "loss", "content": 0.017280694097280502, "timestamp": "2025-09-30 22:10:11.471294", "step": 843, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:11.504526", "step": 843, "epoch": 1 }, { "type": "loss", "content": 0.01756494864821434, "timestamp": "2025-09-30 22:10:11.529370", "step": 844, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:11.569818", "step": 844, "epoch": 1 }, { "type": "loss", "content": 0.006199831608682871, "timestamp": "2025-09-30 22:10:11.572811", "step": 845, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:11.606421", "step": 845, "epoch": 1 }, { "type": "loss", "content": 0.001826465129852295, "timestamp": "2025-09-30 22:10:11.609304", "step": 846, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:11.645536", "step": 846, "epoch": 1 }, { "type": "loss", "content": 0.02917005680501461, "timestamp": "2025-09-30 22:10:11.648355", "step": 847, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:11.681622", "step": 847, "epoch": 1 }, { "type": "loss", "content": 0.02054743655025959, "timestamp": "2025-09-30 22:10:11.706800", "step": 848, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:11.751607", "step": 848, "epoch": 1 }, { "type": "loss", "content": 0.003462132764980197, "timestamp": "2025-09-30 22:10:11.754761", "step": 849, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:11.799828", "step": 849, "epoch": 1 }, { "type": "loss", "content": 0.0037231442984193563, "timestamp": "2025-09-30 22:10:11.802639", "step": 850, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:11.844274", "step": 850, "epoch": 1 }, { "type": "loss", "content": 0.0038203117437660694, "timestamp": "2025-09-30 22:10:11.857370", "step": 851, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:11.898968", "step": 851, "epoch": 1 }, { "type": "loss", "content": 0.012988201342523098, "timestamp": "2025-09-30 22:10:11.923772", "step": 852, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:11.960516", "step": 852, "epoch": 1 }, { "type": "loss", "content": 0.005469611845910549, "timestamp": "2025-09-30 22:10:11.963453", "step": 853, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:12.003814", "step": 853, "epoch": 1 }, { "type": "loss", "content": 0.02971557341516018, "timestamp": "2025-09-30 22:10:12.006786", "step": 854, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:12.049344", "step": 854, "epoch": 1 }, { "type": "loss", "content": 0.004186688922345638, "timestamp": "2025-09-30 22:10:12.051818", "step": 855, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:10:13.067796", "step": 855, "epoch": 1 }, { "type": "pplx", "content": 76245623.29374737, "timestamp": "2025-09-30 22:10:13.071528", "step": 855, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:13.103783", "step": 855, "epoch": 1 }, { "type": "loss", "content": 0.004449806176126003, "timestamp": "2025-09-30 22:10:13.136091", "step": 856, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:13.170275", "step": 856, "epoch": 1 }, { "type": "loss", "content": 0.014926557429134846, "timestamp": "2025-09-30 22:10:13.179783", "step": 857, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:13.212624", "step": 857, "epoch": 1 }, { "type": "loss", "content": 0.009851276874542236, "timestamp": "2025-09-30 22:10:13.215334", "step": 858, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:13.253845", "step": 858, "epoch": 1 }, { "type": "loss", "content": 0.012923565693199635, "timestamp": "2025-09-30 22:10:13.264991", "step": 859, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:13.308503", "step": 859, "epoch": 1 }, { "type": "loss", "content": 0.026185166090726852, "timestamp": "2025-09-30 22:10:13.338000", "step": 860, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:13.392677", "step": 860, "epoch": 1 }, { "type": "loss", "content": 0.014344602823257446, "timestamp": "2025-09-30 22:10:13.395173", "step": 861, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:13.441472", "step": 861, "epoch": 1 }, { "type": "loss", "content": 0.006821572780609131, "timestamp": "2025-09-30 22:10:13.444125", "step": 862, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:13.483678", "step": 862, "epoch": 1 }, { "type": "loss", "content": 0.012688002549111843, "timestamp": "2025-09-30 22:10:13.494659", "step": 863, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:13.530177", "step": 863, "epoch": 1 }, { "type": "loss", "content": 0.008426748216152191, "timestamp": "2025-09-30 22:10:13.554357", "step": 864, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:13.587828", "step": 864, "epoch": 1 }, { "type": "loss", "content": 0.02354290522634983, "timestamp": "2025-09-30 22:10:13.591844", "step": 865, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:13.627896", "step": 865, "epoch": 1 }, { "type": "loss", "content": 0.0057960678823292255, "timestamp": "2025-09-30 22:10:13.631410", "step": 866, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:13.666119", "step": 866, "epoch": 1 }, { "type": "loss", "content": 0.0035158873070031404, "timestamp": "2025-09-30 22:10:13.668876", "step": 867, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:13.702186", "step": 867, "epoch": 1 }, { "type": "loss", "content": 0.01108395867049694, "timestamp": "2025-09-30 22:10:13.727805", "step": 868, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:13.770813", "step": 868, "epoch": 1 }, { "type": "loss", "content": 0.016525190323591232, "timestamp": "2025-09-30 22:10:13.773329", "step": 869, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:13.812572", "step": 869, "epoch": 1 }, { "type": "loss", "content": 0.04078942909836769, "timestamp": "2025-09-30 22:10:13.816809", "step": 870, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:13.849193", "step": 870, "epoch": 1 }, { "type": "loss", "content": 0.006157420575618744, "timestamp": "2025-09-30 22:10:13.851316", "step": 871, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:13.885135", "step": 871, "epoch": 1 }, { "type": "loss", "content": 0.0009152033017016947, "timestamp": "2025-09-30 22:10:13.909702", "step": 872, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:13.945406", "step": 872, "epoch": 1 }, { "type": "loss", "content": 0.012125137262046337, "timestamp": "2025-09-30 22:10:13.948015", "step": 873, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:13.985528", "step": 873, "epoch": 1 }, { "type": "loss", "content": 0.006319773383438587, "timestamp": "2025-09-30 22:10:13.995904", "step": 874, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:14.030647", "step": 874, "epoch": 1 }, { "type": "loss", "content": 0.023734863847494125, "timestamp": "2025-09-30 22:10:14.033912", "step": 875, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:14.070585", "step": 875, "epoch": 1 }, { "type": "loss", "content": 0.03549584373831749, "timestamp": "2025-09-30 22:10:14.096078", "step": 876, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:14.137320", "step": 876, "epoch": 1 }, { "type": "loss", "content": 0.003112002043053508, "timestamp": "2025-09-30 22:10:14.140961", "step": 877, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:14.175720", "step": 877, "epoch": 1 }, { "type": "loss", "content": 0.0011412525782361627, "timestamp": "2025-09-30 22:10:14.180402", "step": 878, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:14.214505", "step": 878, "epoch": 1 }, { "type": "loss", "content": 0.0346827395260334, "timestamp": "2025-09-30 22:10:14.218881", "step": 879, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:14.268462", "step": 879, "epoch": 1 }, { "type": "loss", "content": 0.05454268679022789, "timestamp": "2025-09-30 22:10:14.293747", "step": 880, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:14.328093", "step": 880, "epoch": 1 }, { "type": "loss", "content": 0.023915329948067665, "timestamp": "2025-09-30 22:10:14.331235", "step": 881, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:14.368028", "step": 881, "epoch": 1 }, { "type": "loss", "content": 0.03430422767996788, "timestamp": "2025-09-30 22:10:14.370830", "step": 882, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:14.412088", "step": 882, "epoch": 1 }, { "type": "loss", "content": 0.025995483621954918, "timestamp": "2025-09-30 22:10:14.415882", "step": 883, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:14.454009", "step": 883, "epoch": 1 }, { "type": "loss", "content": 0.0020150647033005953, "timestamp": "2025-09-30 22:10:14.479840", "step": 884, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:14.514068", "step": 884, "epoch": 1 }, { "type": "loss", "content": 0.02836998924612999, "timestamp": "2025-09-30 22:10:14.517649", "step": 885, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:14.550930", "step": 885, "epoch": 1 }, { "type": "loss", "content": 0.011780476197600365, "timestamp": "2025-09-30 22:10:14.553230", "step": 886, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:14.587832", "step": 886, "epoch": 1 }, { "type": "loss", "content": 0.02879740297794342, "timestamp": "2025-09-30 22:10:14.591053", "step": 887, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:14.640613", "step": 887, "epoch": 1 }, { "type": "loss", "content": 0.03621939569711685, "timestamp": "2025-09-30 22:10:14.666454", "step": 888, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:14.700269", "step": 888, "epoch": 1 }, { "type": "loss", "content": 0.019999349489808083, "timestamp": "2025-09-30 22:10:14.711139", "step": 889, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:14.746012", "step": 889, "epoch": 1 }, { "type": "loss", "content": 0.05889752879738808, "timestamp": "2025-09-30 22:10:14.748870", "step": 890, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:14.786331", "step": 890, "epoch": 1 }, { "type": "loss", "content": 0.011419662274420261, "timestamp": "2025-09-30 22:10:14.790768", "step": 891, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:14.825971", "step": 891, "epoch": 1 }, { "type": "loss", "content": 0.020933421328663826, "timestamp": "2025-09-30 22:10:14.851145", "step": 892, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:14.889528", "step": 892, "epoch": 1 }, { "type": "loss", "content": 0.02317097596824169, "timestamp": "2025-09-30 22:10:14.893180", "step": 893, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:14.929331", "step": 893, "epoch": 1 }, { "type": "loss", "content": 0.04982530325651169, "timestamp": "2025-09-30 22:10:14.932743", "step": 894, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:14.970798", "step": 894, "epoch": 1 }, { "type": "loss", "content": 0.014119095169007778, "timestamp": "2025-09-30 22:10:14.979403", "step": 895, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:15.013199", "step": 895, "epoch": 1 }, { "type": "loss", "content": 0.02401905320584774, "timestamp": "2025-09-30 22:10:15.037581", "step": 896, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:15.072554", "step": 896, "epoch": 1 }, { "type": "loss", "content": 0.01428347546607256, "timestamp": "2025-09-30 22:10:15.076634", "step": 897, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:15.116302", "step": 897, "epoch": 1 }, { "type": "loss", "content": 0.016454854980111122, "timestamp": "2025-09-30 22:10:15.121019", "step": 898, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:15.154705", "step": 898, "epoch": 1 }, { "type": "loss", "content": 0.016421593725681305, "timestamp": "2025-09-30 22:10:15.157974", "step": 899, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:15.190313", "step": 899, "epoch": 1 }, { "type": "loss", "content": 0.0145410830155015, "timestamp": "2025-09-30 22:10:15.214308", "step": 900, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:15.255178", "step": 900, "epoch": 1 }, { "type": "loss", "content": 0.014178029261529446, "timestamp": "2025-09-30 22:10:15.258194", "step": 901, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:15.297815", "step": 901, "epoch": 1 }, { "type": "loss", "content": 0.00803608912974596, "timestamp": "2025-09-30 22:10:15.300276", "step": 902, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:15.334865", "step": 902, "epoch": 1 }, { "type": "loss", "content": 0.015483580529689789, "timestamp": "2025-09-30 22:10:15.337619", "step": 903, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:15.377575", "step": 903, "epoch": 1 }, { "type": "loss", "content": 0.026707107201218605, "timestamp": "2025-09-30 22:10:15.401993", "step": 904, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:15.435159", "step": 904, "epoch": 1 }, { "type": "loss", "content": 0.0035559723619371653, "timestamp": "2025-09-30 22:10:15.437922", "step": 905, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:15.473476", "step": 905, "epoch": 1 }, { "type": "loss", "content": 0.006338414270430803, "timestamp": "2025-09-30 22:10:15.476934", "step": 906, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:15.511633", "step": 906, "epoch": 1 }, { "type": "loss", "content": 0.039536114782094955, "timestamp": "2025-09-30 22:10:15.517593", "step": 907, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:15.553172", "step": 907, "epoch": 1 }, { "type": "loss", "content": 0.02256901189684868, "timestamp": "2025-09-30 22:10:15.584110", "step": 908, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:15.625112", "step": 908, "epoch": 1 }, { "type": "loss", "content": 0.02677801065146923, "timestamp": "2025-09-30 22:10:15.627782", "step": 909, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:15.663550", "step": 909, "epoch": 1 }, { "type": "loss", "content": 0.0022334277164191008, "timestamp": "2025-09-30 22:10:15.670463", "step": 910, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:15.716567", "step": 910, "epoch": 1 }, { "type": "loss", "content": 0.015281999483704567, "timestamp": "2025-09-30 22:10:15.719979", "step": 911, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:15.763388", "step": 911, "epoch": 1 }, { "type": "loss", "content": 0.0033361141104251146, "timestamp": "2025-09-30 22:10:15.793277", "step": 912, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:10:16.748223", "step": 912, "epoch": 1 }, { "type": "pplx", "content": 62558746.91204297, "timestamp": "2025-09-30 22:10:16.751089", "step": 912, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:16.792362", "step": 912, "epoch": 1 }, { "type": "loss", "content": 0.040188103914260864, "timestamp": "2025-09-30 22:10:16.794586", "step": 913, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:16.827625", "step": 913, "epoch": 1 }, { "type": "loss", "content": 0.008862501941621304, "timestamp": "2025-09-30 22:10:16.834192", "step": 914, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:16.866079", "step": 914, "epoch": 1 }, { "type": "loss", "content": 0.004693312104791403, "timestamp": "2025-09-30 22:10:16.870428", "step": 915, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:16.905282", "step": 915, "epoch": 1 }, { "type": "loss", "content": 0.005864806938916445, "timestamp": "2025-09-30 22:10:16.928897", "step": 916, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:16.963601", "step": 916, "epoch": 1 }, { "type": "loss", "content": 0.030392948538064957, "timestamp": "2025-09-30 22:10:16.967509", "step": 917, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:17.029177", "step": 917, "epoch": 2 }, { "type": "loss", "content": 0.04122446849942207, "timestamp": "2025-09-30 22:10:17.033503", "step": 918, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:17.068472", "step": 918, "epoch": 2 }, { "type": "loss", "content": 0.017605489119887352, "timestamp": "2025-09-30 22:10:17.073010", "step": 919, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:17.108665", "step": 919, "epoch": 2 }, { "type": "loss", "content": 0.043698132038116455, "timestamp": "2025-09-30 22:10:17.133330", "step": 920, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:17.171458", "step": 920, "epoch": 2 }, { "type": "loss", "content": 0.01505519263446331, "timestamp": "2025-09-30 22:10:17.174223", "step": 921, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:17.213216", "step": 921, "epoch": 2 }, { "type": "loss", "content": 0.03668801113963127, "timestamp": "2025-09-30 22:10:17.218407", "step": 922, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:17.255918", "step": 922, "epoch": 2 }, { "type": "loss", "content": 0.02062925696372986, "timestamp": "2025-09-30 22:10:17.259624", "step": 923, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:17.298606", "step": 923, "epoch": 2 }, { "type": "loss", "content": 0.031664784997701645, "timestamp": "2025-09-30 22:10:17.324678", "step": 924, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:17.362897", "step": 924, "epoch": 2 }, { "type": "loss", "content": 0.015291580930352211, "timestamp": "2025-09-30 22:10:17.369368", "step": 925, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:17.403591", "step": 925, "epoch": 2 }, { "type": "loss", "content": 0.01260668970644474, "timestamp": "2025-09-30 22:10:17.415503", "step": 926, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:17.461527", "step": 926, "epoch": 2 }, { "type": "loss", "content": 0.020603179931640625, "timestamp": "2025-09-30 22:10:17.464519", "step": 927, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:17.506816", "step": 927, "epoch": 2 }, { "type": "loss", "content": 0.018412886187434196, "timestamp": "2025-09-30 22:10:17.531909", "step": 928, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:17.565854", "step": 928, "epoch": 2 }, { "type": "loss", "content": 0.014865756034851074, "timestamp": "2025-09-30 22:10:17.572333", "step": 929, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:17.613432", "step": 929, "epoch": 2 }, { "type": "loss", "content": 0.019409114494919777, "timestamp": "2025-09-30 22:10:17.616608", "step": 930, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:17.655128", "step": 930, "epoch": 2 }, { "type": "loss", "content": 0.016556749120354652, "timestamp": "2025-09-30 22:10:17.657337", "step": 931, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:17.689814", "step": 931, "epoch": 2 }, { "type": "loss", "content": 0.025179879739880562, "timestamp": "2025-09-30 22:10:17.716195", "step": 932, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:17.754860", "step": 932, "epoch": 2 }, { "type": "loss", "content": 0.019688468426465988, "timestamp": "2025-09-30 22:10:17.757557", "step": 933, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:17.791526", "step": 933, "epoch": 2 }, { "type": "loss", "content": 0.028649795800447464, "timestamp": "2025-09-30 22:10:17.795370", "step": 934, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:17.834687", "step": 934, "epoch": 2 }, { "type": "loss", "content": 0.013120375573635101, "timestamp": "2025-09-30 22:10:17.845972", "step": 935, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:17.890825", "step": 935, "epoch": 2 }, { "type": "loss", "content": 0.024192016571760178, "timestamp": "2025-09-30 22:10:17.915480", "step": 936, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:17.954377", "step": 936, "epoch": 2 }, { "type": "loss", "content": 0.018348783254623413, "timestamp": "2025-09-30 22:10:17.960764", "step": 937, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:18.013060", "step": 937, "epoch": 2 }, { "type": "loss", "content": 0.021533623337745667, "timestamp": "2025-09-30 22:10:18.017018", "step": 938, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:18.050118", "step": 938, "epoch": 2 }, { "type": "loss", "content": 0.026502570137381554, "timestamp": "2025-09-30 22:10:18.054381", "step": 939, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:18.093759", "step": 939, "epoch": 2 }, { "type": "loss", "content": 0.018051905557513237, "timestamp": "2025-09-30 22:10:18.120524", "step": 940, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:18.168520", "step": 940, "epoch": 2 }, { "type": "loss", "content": 0.02367454580962658, "timestamp": "2025-09-30 22:10:18.173392", "step": 941, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:18.211846", "step": 941, "epoch": 2 }, { "type": "loss", "content": 0.01433511357754469, "timestamp": "2025-09-30 22:10:18.215841", "step": 942, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:18.265855", "step": 942, "epoch": 2 }, { "type": "loss", "content": 0.013081178069114685, "timestamp": "2025-09-30 22:10:18.268716", "step": 943, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:18.302341", "step": 943, "epoch": 2 }, { "type": "loss", "content": 0.015170074068009853, "timestamp": "2025-09-30 22:10:18.326621", "step": 944, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:18.374861", "step": 944, "epoch": 2 }, { "type": "loss", "content": 0.023984912782907486, "timestamp": "2025-09-30 22:10:18.388290", "step": 945, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:18.437354", "step": 945, "epoch": 2 }, { "type": "loss", "content": 0.016848895698785782, "timestamp": "2025-09-30 22:10:18.440282", "step": 946, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:18.482779", "step": 946, "epoch": 2 }, { "type": "loss", "content": 0.017187591642141342, "timestamp": "2025-09-30 22:10:18.486116", "step": 947, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:18.524752", "step": 947, "epoch": 2 }, { "type": "loss", "content": 0.023166237398982048, "timestamp": "2025-09-30 22:10:18.550134", "step": 948, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:18.586866", "step": 948, "epoch": 2 }, { "type": "loss", "content": 0.018485546112060547, "timestamp": "2025-09-30 22:10:18.590276", "step": 949, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:18.625690", "step": 949, "epoch": 2 }, { "type": "loss", "content": 0.0056577036157250404, "timestamp": "2025-09-30 22:10:18.629779", "step": 950, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:18.676779", "step": 950, "epoch": 2 }, { "type": "loss", "content": 0.03402547910809517, "timestamp": "2025-09-30 22:10:18.681353", "step": 951, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:18.719368", "step": 951, "epoch": 2 }, { "type": "loss", "content": 0.007531954441219568, "timestamp": "2025-09-30 22:10:18.751320", "step": 952, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:18.784831", "step": 952, "epoch": 2 }, { "type": "loss", "content": 0.01770077273249626, "timestamp": "2025-09-30 22:10:18.788229", "step": 953, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:18.824869", "step": 953, "epoch": 2 }, { "type": "loss", "content": 0.013906643725931644, "timestamp": "2025-09-30 22:10:18.837494", "step": 954, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:18.878918", "step": 954, "epoch": 2 }, { "type": "loss", "content": 0.011043167673051357, "timestamp": "2025-09-30 22:10:18.882603", "step": 955, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:18.918145", "step": 955, "epoch": 2 }, { "type": "loss", "content": 0.006359127350151539, "timestamp": "2025-09-30 22:10:18.946118", "step": 956, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:18.980317", "step": 956, "epoch": 2 }, { "type": "loss", "content": 0.008600293658673763, "timestamp": "2025-09-30 22:10:18.984720", "step": 957, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:19.029161", "step": 957, "epoch": 2 }, { "type": "loss", "content": 0.004958875942975283, "timestamp": "2025-09-30 22:10:19.035639", "step": 958, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:19.073838", "step": 958, "epoch": 2 }, { "type": "loss", "content": 0.020593535155057907, "timestamp": "2025-09-30 22:10:19.084805", "step": 959, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:19.141702", "step": 959, "epoch": 2 }, { "type": "loss", "content": 0.017098385840654373, "timestamp": "2025-09-30 22:10:19.166911", "step": 960, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:19.200471", "step": 960, "epoch": 2 }, { "type": "loss", "content": 0.0009686704725027084, "timestamp": "2025-09-30 22:10:19.203051", "step": 961, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:19.243207", "step": 961, "epoch": 2 }, { "type": "loss", "content": 0.002368885325267911, "timestamp": "2025-09-30 22:10:19.246636", "step": 962, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:19.280102", "step": 962, "epoch": 2 }, { "type": "loss", "content": 0.0030674913432449102, "timestamp": "2025-09-30 22:10:19.284029", "step": 963, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:19.318671", "step": 963, "epoch": 2 }, { "type": "loss", "content": 0.015550940297544003, "timestamp": "2025-09-30 22:10:19.344411", "step": 964, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:19.413774", "step": 964, "epoch": 2 }, { "type": "loss", "content": 0.005516111385077238, "timestamp": "2025-09-30 22:10:19.416740", "step": 965, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:19.456668", "step": 965, "epoch": 2 }, { "type": "loss", "content": 0.0011931475019082427, "timestamp": "2025-09-30 22:10:19.460544", "step": 966, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:19.495950", "step": 966, "epoch": 2 }, { "type": "loss", "content": 0.02176191285252571, "timestamp": "2025-09-30 22:10:19.499482", "step": 967, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:19.545594", "step": 967, "epoch": 2 }, { "type": "loss", "content": 0.0032643016893416643, "timestamp": "2025-09-30 22:10:19.569621", "step": 968, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:19.602255", "step": 968, "epoch": 2 }, { "type": "loss", "content": 0.04372279718518257, "timestamp": "2025-09-30 22:10:19.618747", "step": 969, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:10:20.657409", "step": 969, "epoch": 2 }, { "type": "pplx", "content": 65788041.33313765, "timestamp": "2025-09-30 22:10:20.661987", "step": 969, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:20.693631", "step": 969, "epoch": 2 }, { "type": "loss", "content": 0.033423688262701035, "timestamp": "2025-09-30 22:10:20.698865", "step": 970, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:20.738094", "step": 970, "epoch": 2 }, { "type": "loss", "content": 0.030283639207482338, "timestamp": "2025-09-30 22:10:20.744640", "step": 971, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:20.779689", "step": 971, "epoch": 2 }, { "type": "loss", "content": 0.015880217775702477, "timestamp": "2025-09-30 22:10:20.803928", "step": 972, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:20.844820", "step": 972, "epoch": 2 }, { "type": "loss", "content": 0.022766422480344772, "timestamp": "2025-09-30 22:10:20.848562", "step": 973, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:20.883473", "step": 973, "epoch": 2 }, { "type": "loss", "content": 0.013303340412676334, "timestamp": "2025-09-30 22:10:20.887367", "step": 974, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:20.921327", "step": 974, "epoch": 2 }, { "type": "loss", "content": 0.012033392675220966, "timestamp": "2025-09-30 22:10:20.926664", "step": 975, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:20.963256", "step": 975, "epoch": 2 }, { "type": "loss", "content": 0.020994653925299644, "timestamp": "2025-09-30 22:10:20.990087", "step": 976, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:21.028001", "step": 976, "epoch": 2 }, { "type": "loss", "content": 0.035628240555524826, "timestamp": "2025-09-30 22:10:21.033008", "step": 977, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:21.067850", "step": 977, "epoch": 2 }, { "type": "loss", "content": 0.008654872886836529, "timestamp": "2025-09-30 22:10:21.072430", "step": 978, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:21.110669", "step": 978, "epoch": 2 }, { "type": "loss", "content": 0.016928749158978462, "timestamp": "2025-09-30 22:10:21.115382", "step": 979, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:21.159459", "step": 979, "epoch": 2 }, { "type": "loss", "content": 0.002923868829384446, "timestamp": "2025-09-30 22:10:21.185317", "step": 980, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:21.220838", "step": 980, "epoch": 2 }, { "type": "loss", "content": 0.005316472612321377, "timestamp": "2025-09-30 22:10:21.224686", "step": 981, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:21.270451", "step": 981, "epoch": 2 }, { "type": "loss", "content": 0.005148599855601788, "timestamp": "2025-09-30 22:10:21.280768", "step": 982, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:21.316564", "step": 982, "epoch": 2 }, { "type": "loss", "content": 0.03270730748772621, "timestamp": "2025-09-30 22:10:21.320244", "step": 983, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:21.357069", "step": 983, "epoch": 2 }, { "type": "loss", "content": 0.005614656023681164, "timestamp": "2025-09-30 22:10:21.382552", "step": 984, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:21.420065", "step": 984, "epoch": 2 }, { "type": "loss", "content": 0.01812359131872654, "timestamp": "2025-09-30 22:10:21.423886", "step": 985, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:21.465694", "step": 985, "epoch": 2 }, { "type": "loss", "content": 0.02315768599510193, "timestamp": "2025-09-30 22:10:21.470914", "step": 986, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:21.508553", "step": 986, "epoch": 2 }, { "type": "loss", "content": 0.030391912907361984, "timestamp": "2025-09-30 22:10:21.512422", "step": 987, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:21.548808", "step": 987, "epoch": 2 }, { "type": "loss", "content": 0.0035188833717256784, "timestamp": "2025-09-30 22:10:21.573594", "step": 988, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:21.613076", "step": 988, "epoch": 2 }, { "type": "loss", "content": 0.03694489970803261, "timestamp": "2025-09-30 22:10:21.618536", "step": 989, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:21.661314", "step": 989, "epoch": 2 }, { "type": "loss", "content": 0.00450171111151576, "timestamp": "2025-09-30 22:10:21.666952", "step": 990, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:21.705355", "step": 990, "epoch": 2 }, { "type": "loss", "content": 0.041242681443691254, "timestamp": "2025-09-30 22:10:21.711881", "step": 991, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:21.751963", "step": 991, "epoch": 2 }, { "type": "loss", "content": 0.019213315099477768, "timestamp": "2025-09-30 22:10:21.778208", "step": 992, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:21.816531", "step": 992, "epoch": 2 }, { "type": "loss", "content": 0.021348316222429276, "timestamp": "2025-09-30 22:10:21.819610", "step": 993, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:21.862860", "step": 993, "epoch": 2 }, { "type": "loss", "content": 0.025651155039668083, "timestamp": "2025-09-30 22:10:21.866330", "step": 994, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:21.902766", "step": 994, "epoch": 2 }, { "type": "loss", "content": 0.017335962504148483, "timestamp": "2025-09-30 22:10:21.906886", "step": 995, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:21.942458", "step": 995, "epoch": 2 }, { "type": "loss", "content": 0.02041892521083355, "timestamp": "2025-09-30 22:10:21.968595", "step": 996, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:22.002063", "step": 996, "epoch": 2 }, { "type": "loss", "content": 0.019564051181077957, "timestamp": "2025-09-30 22:10:22.006678", "step": 997, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:22.040636", "step": 997, "epoch": 2 }, { "type": "loss", "content": 0.028469962999224663, "timestamp": "2025-09-30 22:10:22.044203", "step": 998, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:22.081715", "step": 998, "epoch": 2 }, { "type": "loss", "content": 0.02833707444369793, "timestamp": "2025-09-30 22:10:22.084474", "step": 999, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:22.119282", "step": 999, "epoch": 2 }, { "type": "loss", "content": 0.0041335872374475, "timestamp": "2025-09-30 22:10:22.144197", "step": 1000, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1000", "timestamp": "2025-09-30 22:10:28.959668", "step": 1000, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:29.015366", "step": 1000, "epoch": 2 }, { "type": "loss", "content": 0.02058243192732334, "timestamp": "2025-09-30 22:10:29.018041", "step": 1001, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:29.057796", "step": 1001, "epoch": 2 }, { "type": "loss", "content": 0.019590478390455246, "timestamp": "2025-09-30 22:10:29.062434", "step": 1002, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:29.100356", "step": 1002, "epoch": 2 }, { "type": "loss", "content": 0.02721039578318596, "timestamp": "2025-09-30 22:10:29.102644", "step": 1003, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:29.138940", "step": 1003, "epoch": 2 }, { "type": "loss", "content": 0.008601582609117031, "timestamp": "2025-09-30 22:10:29.163786", "step": 1004, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:29.194502", "step": 1004, "epoch": 2 }, { "type": "loss", "content": 0.01570257730782032, "timestamp": "2025-09-30 22:10:29.197243", "step": 1005, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:29.229643", "step": 1005, "epoch": 2 }, { "type": "loss", "content": 0.024934740737080574, "timestamp": "2025-09-30 22:10:29.235385", "step": 1006, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:29.267684", "step": 1006, "epoch": 2 }, { "type": "loss", "content": 0.02274438552558422, "timestamp": "2025-09-30 22:10:29.273751", "step": 1007, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:29.314877", "step": 1007, "epoch": 2 }, { "type": "loss", "content": 0.03345928713679314, "timestamp": "2025-09-30 22:10:29.340342", "step": 1008, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:29.372546", "step": 1008, "epoch": 2 }, { "type": "loss", "content": 0.023872241377830505, "timestamp": "2025-09-30 22:10:29.374923", "step": 1009, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:29.410290", "step": 1009, "epoch": 2 }, { "type": "loss", "content": 0.013263803906738758, "timestamp": "2025-09-30 22:10:29.412737", "step": 1010, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:29.449315", "step": 1010, "epoch": 2 }, { "type": "loss", "content": 0.008637587539851665, "timestamp": "2025-09-30 22:10:29.452054", "step": 1011, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:29.483839", "step": 1011, "epoch": 2 }, { "type": "loss", "content": 0.025991391390562057, "timestamp": "2025-09-30 22:10:29.507270", "step": 1012, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:29.539243", "step": 1012, "epoch": 2 }, { "type": "loss", "content": 0.03899989277124405, "timestamp": "2025-09-30 22:10:29.541750", "step": 1013, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:29.572992", "step": 1013, "epoch": 2 }, { "type": "loss", "content": 0.023212125524878502, "timestamp": "2025-09-30 22:10:29.576482", "step": 1014, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:29.608541", "step": 1014, "epoch": 2 }, { "type": "loss", "content": 0.016156161203980446, "timestamp": "2025-09-30 22:10:29.610741", "step": 1015, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:29.642214", "step": 1015, "epoch": 2 }, { "type": "loss", "content": 0.005735450424253941, "timestamp": "2025-09-30 22:10:29.666732", "step": 1016, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:29.699084", "step": 1016, "epoch": 2 }, { "type": "loss", "content": 0.022412346675992012, "timestamp": "2025-09-30 22:10:29.701194", "step": 1017, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:29.731552", "step": 1017, "epoch": 2 }, { "type": "loss", "content": 0.02042771875858307, "timestamp": "2025-09-30 22:10:29.733969", "step": 1018, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:29.766593", "step": 1018, "epoch": 2 }, { "type": "loss", "content": 0.011804310604929924, "timestamp": "2025-09-30 22:10:29.770584", "step": 1019, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:29.803087", "step": 1019, "epoch": 2 }, { "type": "loss", "content": 0.018140995875000954, "timestamp": "2025-09-30 22:10:29.831209", "step": 1020, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:29.862791", "step": 1020, "epoch": 2 }, { "type": "loss", "content": 0.026703549548983574, "timestamp": "2025-09-30 22:10:29.867303", "step": 1021, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:29.904426", "step": 1021, "epoch": 2 }, { "type": "loss", "content": 0.038636982440948486, "timestamp": "2025-09-30 22:10:29.907011", "step": 1022, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:29.938166", "step": 1022, "epoch": 2 }, { "type": "loss", "content": 0.0022895862348377705, "timestamp": "2025-09-30 22:10:29.940260", "step": 1023, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:29.972463", "step": 1023, "epoch": 2 }, { "type": "loss", "content": 0.010432879440486431, "timestamp": "2025-09-30 22:10:29.996867", "step": 1024, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:30.029207", "step": 1024, "epoch": 2 }, { "type": "loss", "content": 0.04155639931559563, "timestamp": "2025-09-30 22:10:30.034069", "step": 1025, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:30.064974", "step": 1025, "epoch": 2 }, { "type": "loss", "content": 0.05263324826955795, "timestamp": "2025-09-30 22:10:30.067364", "step": 1026, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:10:30.910493", "step": 1026, "epoch": 2 }, { "type": "pplx", "content": 60215314.01973439, "timestamp": "2025-09-30 22:10:30.913813", "step": 1026, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:10:30.945897", "step": 1026, "epoch": 2 }, { "type": "loss", "content": 0.0612885057926178, "timestamp": "2025-09-30 22:10:30.948322", "step": 1027, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:30.982165", "step": 1027, "epoch": 2 }, { "type": "loss", "content": 0.002322312444448471, "timestamp": "2025-09-30 22:10:31.005970", "step": 1028, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:31.040105", "step": 1028, "epoch": 2 }, { "type": "loss", "content": 0.01675250194966793, "timestamp": "2025-09-30 22:10:31.042778", "step": 1029, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:31.075684", "step": 1029, "epoch": 2 }, { "type": "loss", "content": 0.012638731859624386, "timestamp": "2025-09-30 22:10:31.079022", "step": 1030, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:31.123037", "step": 1030, "epoch": 2 }, { "type": "loss", "content": 0.012278775684535503, "timestamp": "2025-09-30 22:10:31.130874", "step": 1031, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:31.165879", "step": 1031, "epoch": 2 }, { "type": "loss", "content": 0.0036765080876648426, "timestamp": "2025-09-30 22:10:31.191433", "step": 1032, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:31.227516", "step": 1032, "epoch": 2 }, { "type": "loss", "content": 0.017122618854045868, "timestamp": "2025-09-30 22:10:31.231045", "step": 1033, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:31.264383", "step": 1033, "epoch": 2 }, { "type": "loss", "content": 0.01866624876856804, "timestamp": "2025-09-30 22:10:31.267352", "step": 1034, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:31.309810", "step": 1034, "epoch": 2 }, { "type": "loss", "content": 0.017900409176945686, "timestamp": "2025-09-30 22:10:31.312897", "step": 1035, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:31.346729", "step": 1035, "epoch": 2 }, { "type": "loss", "content": 0.006202004384249449, "timestamp": "2025-09-30 22:10:31.371977", "step": 1036, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:31.407889", "step": 1036, "epoch": 2 }, { "type": "loss", "content": 0.0101151829585433, "timestamp": "2025-09-30 22:10:31.412188", "step": 1037, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:31.444937", "step": 1037, "epoch": 2 }, { "type": "loss", "content": 0.038425859063863754, "timestamp": "2025-09-30 22:10:31.448352", "step": 1038, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:31.480931", "step": 1038, "epoch": 2 }, { "type": "loss", "content": 0.01929587312042713, "timestamp": "2025-09-30 22:10:31.483725", "step": 1039, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:31.518714", "step": 1039, "epoch": 2 }, { "type": "loss", "content": 0.024310678243637085, "timestamp": "2025-09-30 22:10:31.543127", "step": 1040, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:31.578292", "step": 1040, "epoch": 2 }, { "type": "loss", "content": 0.018113570287823677, "timestamp": "2025-09-30 22:10:31.581908", "step": 1041, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:31.623671", "step": 1041, "epoch": 2 }, { "type": "loss", "content": 0.009870938025414944, "timestamp": "2025-09-30 22:10:31.626544", "step": 1042, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:31.661505", "step": 1042, "epoch": 2 }, { "type": "loss", "content": 0.03137834742665291, "timestamp": "2025-09-30 22:10:31.669968", "step": 1043, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:31.705016", "step": 1043, "epoch": 2 }, { "type": "loss", "content": 0.016941692680120468, "timestamp": "2025-09-30 22:10:31.730020", "step": 1044, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:31.765093", "step": 1044, "epoch": 2 }, { "type": "loss", "content": 0.008219278417527676, "timestamp": "2025-09-30 22:10:31.768958", "step": 1045, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:31.804876", "step": 1045, "epoch": 2 }, { "type": "loss", "content": 0.012652714736759663, "timestamp": "2025-09-30 22:10:31.807304", "step": 1046, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:31.854911", "step": 1046, "epoch": 2 }, { "type": "loss", "content": 0.03563131019473076, "timestamp": "2025-09-30 22:10:31.858971", "step": 1047, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:31.894960", "step": 1047, "epoch": 2 }, { "type": "loss", "content": 0.01216198317706585, "timestamp": "2025-09-30 22:10:31.920815", "step": 1048, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:31.955629", "step": 1048, "epoch": 2 }, { "type": "loss", "content": 0.010561487637460232, "timestamp": "2025-09-30 22:10:31.958977", "step": 1049, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:31.992295", "step": 1049, "epoch": 2 }, { "type": "loss", "content": 0.011876196600496769, "timestamp": "2025-09-30 22:10:31.995172", "step": 1050, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:32.027651", "step": 1050, "epoch": 2 }, { "type": "loss", "content": 0.030352991074323654, "timestamp": "2025-09-30 22:10:32.030753", "step": 1051, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:32.071016", "step": 1051, "epoch": 2 }, { "type": "loss", "content": 0.009673316963016987, "timestamp": "2025-09-30 22:10:32.096968", "step": 1052, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:32.140199", "step": 1052, "epoch": 2 }, { "type": "loss", "content": 0.020116552710533142, "timestamp": "2025-09-30 22:10:32.143690", "step": 1053, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:32.181024", "step": 1053, "epoch": 2 }, { "type": "loss", "content": 0.02689552865922451, "timestamp": "2025-09-30 22:10:32.190727", "step": 1054, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:32.232982", "step": 1054, "epoch": 2 }, { "type": "loss", "content": 0.02992871031165123, "timestamp": "2025-09-30 22:10:32.239639", "step": 1055, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:32.283318", "step": 1055, "epoch": 2 }, { "type": "loss", "content": 0.03253794461488724, "timestamp": "2025-09-30 22:10:32.310398", "step": 1056, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:32.346640", "step": 1056, "epoch": 2 }, { "type": "loss", "content": 0.020867738872766495, "timestamp": "2025-09-30 22:10:32.350082", "step": 1057, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:32.382053", "step": 1057, "epoch": 2 }, { "type": "loss", "content": 0.02346712537109852, "timestamp": "2025-09-30 22:10:32.384750", "step": 1058, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:32.423038", "step": 1058, "epoch": 2 }, { "type": "loss", "content": 0.03169684857130051, "timestamp": "2025-09-30 22:10:32.427230", "step": 1059, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:32.470540", "step": 1059, "epoch": 2 }, { "type": "loss", "content": 0.014368179254233837, "timestamp": "2025-09-30 22:10:32.513006", "step": 1060, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:32.568627", "step": 1060, "epoch": 2 }, { "type": "loss", "content": 0.01384419109672308, "timestamp": "2025-09-30 22:10:32.571411", "step": 1061, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:32.608139", "step": 1061, "epoch": 2 }, { "type": "loss", "content": 0.01648283377289772, "timestamp": "2025-09-30 22:10:32.612430", "step": 1062, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:32.651956", "step": 1062, "epoch": 2 }, { "type": "loss", "content": 0.006230255123227835, "timestamp": "2025-09-30 22:10:32.655386", "step": 1063, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:32.689999", "step": 1063, "epoch": 2 }, { "type": "loss", "content": 0.010819241404533386, "timestamp": "2025-09-30 22:10:32.714781", "step": 1064, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:32.746809", "step": 1064, "epoch": 2 }, { "type": "loss", "content": 0.03453322499990463, "timestamp": "2025-09-30 22:10:32.751294", "step": 1065, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:32.787396", "step": 1065, "epoch": 2 }, { "type": "loss", "content": 0.02853448875248432, "timestamp": "2025-09-30 22:10:32.790825", "step": 1066, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:32.839603", "step": 1066, "epoch": 2 }, { "type": "loss", "content": 0.027251912280917168, "timestamp": "2025-09-30 22:10:32.842832", "step": 1067, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:32.882368", "step": 1067, "epoch": 2 }, { "type": "loss", "content": 0.009377564303576946, "timestamp": "2025-09-30 22:10:32.906767", "step": 1068, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:32.951650", "step": 1068, "epoch": 2 }, { "type": "loss", "content": 0.004282251000404358, "timestamp": "2025-09-30 22:10:32.954283", "step": 1069, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:32.989830", "step": 1069, "epoch": 2 }, { "type": "loss", "content": 0.011062629520893097, "timestamp": "2025-09-30 22:10:33.000125", "step": 1070, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:33.042147", "step": 1070, "epoch": 2 }, { "type": "loss", "content": 0.02192043885588646, "timestamp": "2025-09-30 22:10:33.045852", "step": 1071, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:33.079439", "step": 1071, "epoch": 2 }, { "type": "loss", "content": 0.028622159734368324, "timestamp": "2025-09-30 22:10:33.103224", "step": 1072, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:33.139353", "step": 1072, "epoch": 2 }, { "type": "loss", "content": 0.010093754157423973, "timestamp": "2025-09-30 22:10:33.142699", "step": 1073, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:33.179786", "step": 1073, "epoch": 2 }, { "type": "loss", "content": 0.0439641959965229, "timestamp": "2025-09-30 22:10:33.193079", "step": 1074, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:33.243822", "step": 1074, "epoch": 2 }, { "type": "loss", "content": 0.0537438802421093, "timestamp": "2025-09-30 22:10:33.252776", "step": 1075, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:33.291132", "step": 1075, "epoch": 2 }, { "type": "loss", "content": 0.050223346799612045, "timestamp": "2025-09-30 22:10:33.316240", "step": 1076, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:33.355997", "step": 1076, "epoch": 2 }, { "type": "loss", "content": 0.02958056330680847, "timestamp": "2025-09-30 22:10:33.360699", "step": 1077, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:33.394024", "step": 1077, "epoch": 2 }, { "type": "loss", "content": 0.020796921104192734, "timestamp": "2025-09-30 22:10:33.399000", "step": 1078, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:33.434126", "step": 1078, "epoch": 2 }, { "type": "loss", "content": 0.016962138935923576, "timestamp": "2025-09-30 22:10:33.437605", "step": 1079, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:33.478263", "step": 1079, "epoch": 2 }, { "type": "loss", "content": 0.01815619505941868, "timestamp": "2025-09-30 22:10:33.502407", "step": 1080, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:33.541546", "step": 1080, "epoch": 2 }, { "type": "loss", "content": 0.01795378513634205, "timestamp": "2025-09-30 22:10:33.544428", "step": 1081, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:33.588742", "step": 1081, "epoch": 2 }, { "type": "loss", "content": 0.028307702392339706, "timestamp": "2025-09-30 22:10:33.600113", "step": 1082, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:33.641511", "step": 1082, "epoch": 2 }, { "type": "loss", "content": 0.020032325759530067, "timestamp": "2025-09-30 22:10:33.645522", "step": 1083, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:10:34.585328", "step": 1083, "epoch": 2 }, { "type": "pplx", "content": 50986278.02673556, "timestamp": "2025-09-30 22:10:34.588298", "step": 1083, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:34.625695", "step": 1083, "epoch": 2 }, { "type": "loss", "content": 0.014734785072505474, "timestamp": "2025-09-30 22:10:34.649595", "step": 1084, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:34.683388", "step": 1084, "epoch": 2 }, { "type": "loss", "content": 0.005796567536890507, "timestamp": "2025-09-30 22:10:34.686190", "step": 1085, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:34.729315", "step": 1085, "epoch": 2 }, { "type": "loss", "content": 0.013309342786669731, "timestamp": "2025-09-30 22:10:34.737398", "step": 1086, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:34.773113", "step": 1086, "epoch": 2 }, { "type": "loss", "content": 0.02818778157234192, "timestamp": "2025-09-30 22:10:34.776889", "step": 1087, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:34.813976", "step": 1087, "epoch": 2 }, { "type": "loss", "content": 0.009719455614686012, "timestamp": "2025-09-30 22:10:34.839046", "step": 1088, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:34.878814", "step": 1088, "epoch": 2 }, { "type": "loss", "content": 0.032015398144721985, "timestamp": "2025-09-30 22:10:34.882086", "step": 1089, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:34.922540", "step": 1089, "epoch": 2 }, { "type": "loss", "content": 0.017986981198191643, "timestamp": "2025-09-30 22:10:34.927130", "step": 1090, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:34.959720", "step": 1090, "epoch": 2 }, { "type": "loss", "content": 0.041368067264556885, "timestamp": "2025-09-30 22:10:34.962550", "step": 1091, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:35.001883", "step": 1091, "epoch": 2 }, { "type": "loss", "content": 0.007110454607754946, "timestamp": "2025-09-30 22:10:35.034183", "step": 1092, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:35.067238", "step": 1092, "epoch": 2 }, { "type": "loss", "content": 0.01783628761768341, "timestamp": "2025-09-30 22:10:35.070875", "step": 1093, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:35.116127", "step": 1093, "epoch": 2 }, { "type": "loss", "content": 0.013988708145916462, "timestamp": "2025-09-30 22:10:35.119130", "step": 1094, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:35.155784", "step": 1094, "epoch": 2 }, { "type": "loss", "content": 0.010679191909730434, "timestamp": "2025-09-30 22:10:35.158039", "step": 1095, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:35.193822", "step": 1095, "epoch": 2 }, { "type": "loss", "content": 0.006078497972339392, "timestamp": "2025-09-30 22:10:35.222861", "step": 1096, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:35.254498", "step": 1096, "epoch": 2 }, { "type": "loss", "content": 0.015893442556262016, "timestamp": "2025-09-30 22:10:35.257176", "step": 1097, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:35.288882", "step": 1097, "epoch": 2 }, { "type": "loss", "content": 0.022868171334266663, "timestamp": "2025-09-30 22:10:35.292145", "step": 1098, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:35.324755", "step": 1098, "epoch": 2 }, { "type": "loss", "content": 0.01892166957259178, "timestamp": "2025-09-30 22:10:35.327699", "step": 1099, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:35.361135", "step": 1099, "epoch": 2 }, { "type": "loss", "content": 0.024770379066467285, "timestamp": "2025-09-30 22:10:35.385763", "step": 1100, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:35.429158", "step": 1100, "epoch": 2 }, { "type": "loss", "content": 0.025294585153460503, "timestamp": "2025-09-30 22:10:35.432535", "step": 1101, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:35.465816", "step": 1101, "epoch": 2 }, { "type": "loss", "content": 0.010897427797317505, "timestamp": "2025-09-30 22:10:35.468891", "step": 1102, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:35.501198", "step": 1102, "epoch": 2 }, { "type": "loss", "content": 0.018574992194771767, "timestamp": "2025-09-30 22:10:35.503695", "step": 1103, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:35.534900", "step": 1103, "epoch": 2 }, { "type": "loss", "content": 0.015447559766471386, "timestamp": "2025-09-30 22:10:35.559792", "step": 1104, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:35.593465", "step": 1104, "epoch": 2 }, { "type": "loss", "content": 0.011934633366763592, "timestamp": "2025-09-30 22:10:35.596557", "step": 1105, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:35.633372", "step": 1105, "epoch": 2 }, { "type": "loss", "content": 0.009000704623758793, "timestamp": "2025-09-30 22:10:35.635916", "step": 1106, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:35.677150", "step": 1106, "epoch": 2 }, { "type": "loss", "content": 0.02742808684706688, "timestamp": "2025-09-30 22:10:35.679880", "step": 1107, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:35.714544", "step": 1107, "epoch": 2 }, { "type": "loss", "content": 0.02327750436961651, "timestamp": "2025-09-30 22:10:35.739124", "step": 1108, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:35.773463", "step": 1108, "epoch": 2 }, { "type": "loss", "content": 0.011305660009384155, "timestamp": "2025-09-30 22:10:35.776833", "step": 1109, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:35.814782", "step": 1109, "epoch": 2 }, { "type": "loss", "content": 0.020335091277956963, "timestamp": "2025-09-30 22:10:35.817105", "step": 1110, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:35.850752", "step": 1110, "epoch": 2 }, { "type": "loss", "content": 0.019419783726334572, "timestamp": "2025-09-30 22:10:35.853649", "step": 1111, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:35.890700", "step": 1111, "epoch": 2 }, { "type": "loss", "content": 0.029895620420575142, "timestamp": "2025-09-30 22:10:35.914372", "step": 1112, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:35.948609", "step": 1112, "epoch": 2 }, { "type": "loss", "content": 0.02316124178469181, "timestamp": "2025-09-30 22:10:35.951953", "step": 1113, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:35.984088", "step": 1113, "epoch": 2 }, { "type": "loss", "content": 0.011370803229510784, "timestamp": "2025-09-30 22:10:35.987455", "step": 1114, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:36.021144", "step": 1114, "epoch": 2 }, { "type": "loss", "content": 0.020221199840307236, "timestamp": "2025-09-30 22:10:36.024007", "step": 1115, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:36.056691", "step": 1115, "epoch": 2 }, { "type": "loss", "content": 0.016114765778183937, "timestamp": "2025-09-30 22:10:36.081163", "step": 1116, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:36.113278", "step": 1116, "epoch": 2 }, { "type": "loss", "content": 0.023773958906531334, "timestamp": "2025-09-30 22:10:36.115790", "step": 1117, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:36.147247", "step": 1117, "epoch": 2 }, { "type": "loss", "content": 0.012325162068009377, "timestamp": "2025-09-30 22:10:36.150566", "step": 1118, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:36.190399", "step": 1118, "epoch": 2 }, { "type": "loss", "content": 0.018607692793011665, "timestamp": "2025-09-30 22:10:36.192791", "step": 1119, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:36.225227", "step": 1119, "epoch": 2 }, { "type": "loss", "content": 0.027039309963583946, "timestamp": "2025-09-30 22:10:36.249980", "step": 1120, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:36.282406", "step": 1120, "epoch": 2 }, { "type": "loss", "content": 0.025002550333738327, "timestamp": "2025-09-30 22:10:36.284766", "step": 1121, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:36.315779", "step": 1121, "epoch": 2 }, { "type": "loss", "content": 0.00902754720300436, "timestamp": "2025-09-30 22:10:36.321046", "step": 1122, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:36.353064", "step": 1122, "epoch": 2 }, { "type": "loss", "content": 0.012607453390955925, "timestamp": "2025-09-30 22:10:36.358797", "step": 1123, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:36.390125", "step": 1123, "epoch": 2 }, { "type": "loss", "content": 0.015060718171298504, "timestamp": "2025-09-30 22:10:36.413933", "step": 1124, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:36.445249", "step": 1124, "epoch": 2 }, { "type": "loss", "content": 0.006960070692002773, "timestamp": "2025-09-30 22:10:36.447616", "step": 1125, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:36.478822", "step": 1125, "epoch": 2 }, { "type": "loss", "content": 0.024447765201330185, "timestamp": "2025-09-30 22:10:36.482178", "step": 1126, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:36.522154", "step": 1126, "epoch": 2 }, { "type": "loss", "content": 0.030384311452507973, "timestamp": "2025-09-30 22:10:36.524127", "step": 1127, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:36.558656", "step": 1127, "epoch": 2 }, { "type": "loss", "content": 0.006873700302094221, "timestamp": "2025-09-30 22:10:36.582615", "step": 1128, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:36.613650", "step": 1128, "epoch": 2 }, { "type": "loss", "content": 0.04473824054002762, "timestamp": "2025-09-30 22:10:36.616690", "step": 1129, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:36.650084", "step": 1129, "epoch": 2 }, { "type": "loss", "content": 0.03146163001656532, "timestamp": "2025-09-30 22:10:36.652845", "step": 1130, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:36.685683", "step": 1130, "epoch": 2 }, { "type": "loss", "content": 0.012721200473606586, "timestamp": "2025-09-30 22:10:36.694070", "step": 1131, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:36.728145", "step": 1131, "epoch": 2 }, { "type": "loss", "content": 0.011350330896675587, "timestamp": "2025-09-30 22:10:36.753076", "step": 1132, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:36.785770", "step": 1132, "epoch": 2 }, { "type": "loss", "content": 0.023074982687830925, "timestamp": "2025-09-30 22:10:36.788665", "step": 1133, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:36.829646", "step": 1133, "epoch": 2 }, { "type": "loss", "content": 0.01878601871430874, "timestamp": "2025-09-30 22:10:36.838847", "step": 1134, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:36.876824", "step": 1134, "epoch": 2 }, { "type": "loss", "content": 0.021561546251177788, "timestamp": "2025-09-30 22:10:36.879271", "step": 1135, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:36.921248", "step": 1135, "epoch": 2 }, { "type": "loss", "content": 0.010309815406799316, "timestamp": "2025-09-30 22:10:36.949775", "step": 1136, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:36.987081", "step": 1136, "epoch": 2 }, { "type": "loss", "content": 0.006379503291100264, "timestamp": "2025-09-30 22:10:36.992987", "step": 1137, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:37.033910", "step": 1137, "epoch": 2 }, { "type": "loss", "content": 0.01991991326212883, "timestamp": "2025-09-30 22:10:37.036186", "step": 1138, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:37.075130", "step": 1138, "epoch": 2 }, { "type": "loss", "content": 0.011105488054454327, "timestamp": "2025-09-30 22:10:37.078528", "step": 1139, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:37.111543", "step": 1139, "epoch": 2 }, { "type": "loss", "content": 0.0210857093334198, "timestamp": "2025-09-30 22:10:37.135257", "step": 1140, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:10:38.006888", "step": 1140, "epoch": 2 }, { "type": "pplx", "content": 53247525.687543064, "timestamp": "2025-09-30 22:10:38.011543", "step": 1140, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:38.041302", "step": 1140, "epoch": 2 }, { "type": "loss", "content": 0.025197362527251244, "timestamp": "2025-09-30 22:10:38.043813", "step": 1141, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:38.076848", "step": 1141, "epoch": 2 }, { "type": "loss", "content": 0.009555136784911156, "timestamp": "2025-09-30 22:10:38.079531", "step": 1142, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:38.113845", "step": 1142, "epoch": 2 }, { "type": "loss", "content": 0.019450949504971504, "timestamp": "2025-09-30 22:10:38.116681", "step": 1143, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:38.149134", "step": 1143, "epoch": 2 }, { "type": "loss", "content": 0.031687263399362564, "timestamp": "2025-09-30 22:10:38.173690", "step": 1144, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:38.205600", "step": 1144, "epoch": 2 }, { "type": "loss", "content": 0.0040702177211642265, "timestamp": "2025-09-30 22:10:38.210686", "step": 1145, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:38.242816", "step": 1145, "epoch": 2 }, { "type": "loss", "content": 0.0202677883207798, "timestamp": "2025-09-30 22:10:38.244984", "step": 1146, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:38.276778", "step": 1146, "epoch": 2 }, { "type": "loss", "content": 0.010932053439319134, "timestamp": "2025-09-30 22:10:38.283354", "step": 1147, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:38.317640", "step": 1147, "epoch": 2 }, { "type": "loss", "content": 0.00964405108243227, "timestamp": "2025-09-30 22:10:38.342124", "step": 1148, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:38.384688", "step": 1148, "epoch": 2 }, { "type": "loss", "content": 0.02433297224342823, "timestamp": "2025-09-30 22:10:38.387023", "step": 1149, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:38.426732", "step": 1149, "epoch": 2 }, { "type": "loss", "content": 0.01564393751323223, "timestamp": "2025-09-30 22:10:38.432005", "step": 1150, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:38.477250", "step": 1150, "epoch": 2 }, { "type": "loss", "content": 0.03086140565574169, "timestamp": "2025-09-30 22:10:38.481359", "step": 1151, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:38.523262", "step": 1151, "epoch": 2 }, { "type": "loss", "content": 0.01326081994920969, "timestamp": "2025-09-30 22:10:38.547718", "step": 1152, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:38.580312", "step": 1152, "epoch": 2 }, { "type": "loss", "content": 0.015160572715103626, "timestamp": "2025-09-30 22:10:38.582645", "step": 1153, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:38.615338", "step": 1153, "epoch": 2 }, { "type": "loss", "content": 0.009467801079154015, "timestamp": "2025-09-30 22:10:38.617796", "step": 1154, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:38.653777", "step": 1154, "epoch": 2 }, { "type": "loss", "content": 0.025078266859054565, "timestamp": "2025-09-30 22:10:38.657845", "step": 1155, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:38.691646", "step": 1155, "epoch": 2 }, { "type": "loss", "content": 0.02187494933605194, "timestamp": "2025-09-30 22:10:38.715282", "step": 1156, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:38.746226", "step": 1156, "epoch": 2 }, { "type": "loss", "content": 0.013467269018292427, "timestamp": "2025-09-30 22:10:38.748824", "step": 1157, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:38.780337", "step": 1157, "epoch": 2 }, { "type": "loss", "content": 0.008198106661438942, "timestamp": "2025-09-30 22:10:38.782715", "step": 1158, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:38.817716", "step": 1158, "epoch": 2 }, { "type": "loss", "content": 0.012998079881072044, "timestamp": "2025-09-30 22:10:38.826299", "step": 1159, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:38.857572", "step": 1159, "epoch": 2 }, { "type": "loss", "content": 0.009026200510561466, "timestamp": "2025-09-30 22:10:38.882266", "step": 1160, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:38.913813", "step": 1160, "epoch": 2 }, { "type": "loss", "content": 0.009720182977616787, "timestamp": "2025-09-30 22:10:38.916040", "step": 1161, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:38.952656", "step": 1161, "epoch": 2 }, { "type": "loss", "content": 0.025005189701914787, "timestamp": "2025-09-30 22:10:38.955306", "step": 1162, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:38.992268", "step": 1162, "epoch": 2 }, { "type": "loss", "content": 0.01020416896790266, "timestamp": "2025-09-30 22:10:38.998210", "step": 1163, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:39.032297", "step": 1163, "epoch": 2 }, { "type": "loss", "content": 0.020846812054514885, "timestamp": "2025-09-30 22:10:39.055949", "step": 1164, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:39.092365", "step": 1164, "epoch": 2 }, { "type": "loss", "content": 0.014476138167083263, "timestamp": "2025-09-30 22:10:39.095173", "step": 1165, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:39.127332", "step": 1165, "epoch": 2 }, { "type": "loss", "content": 0.016266265884041786, "timestamp": "2025-09-30 22:10:39.130837", "step": 1166, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:39.165870", "step": 1166, "epoch": 2 }, { "type": "loss", "content": 0.009876398369669914, "timestamp": "2025-09-30 22:10:39.168650", "step": 1167, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:39.204173", "step": 1167, "epoch": 2 }, { "type": "loss", "content": 0.03922674432396889, "timestamp": "2025-09-30 22:10:39.228783", "step": 1168, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:39.260215", "step": 1168, "epoch": 2 }, { "type": "loss", "content": 0.016972580924630165, "timestamp": "2025-09-30 22:10:39.262676", "step": 1169, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:39.295458", "step": 1169, "epoch": 2 }, { "type": "loss", "content": 0.010892154648900032, "timestamp": "2025-09-30 22:10:39.298615", "step": 1170, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:39.331862", "step": 1170, "epoch": 2 }, { "type": "loss", "content": 0.01617332175374031, "timestamp": "2025-09-30 22:10:39.335230", "step": 1171, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:39.372021", "step": 1171, "epoch": 2 }, { "type": "loss", "content": 0.00916491262614727, "timestamp": "2025-09-30 22:10:39.396075", "step": 1172, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:10:39.430431", "step": 1172, "epoch": 2 }, { "type": "loss", "content": 0.013127158395946026, "timestamp": "2025-09-30 22:10:39.433742", "step": 1173, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:39.475734", "step": 1173, "epoch": 2 }, { "type": "loss", "content": 0.011959279887378216, "timestamp": "2025-09-30 22:10:39.485008", "step": 1174, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:39.518082", "step": 1174, "epoch": 2 }, { "type": "loss", "content": 0.02032916434109211, "timestamp": "2025-09-30 22:10:39.521480", "step": 1175, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:39.558384", "step": 1175, "epoch": 2 }, { "type": "loss", "content": 0.008914561942219734, "timestamp": "2025-09-30 22:10:39.583458", "step": 1176, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:39.616504", "step": 1176, "epoch": 2 }, { "type": "loss", "content": 0.0263129323720932, "timestamp": "2025-09-30 22:10:39.619293", "step": 1177, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:39.652131", "step": 1177, "epoch": 2 }, { "type": "loss", "content": 0.014628012664616108, "timestamp": "2025-09-30 22:10:39.660690", "step": 1178, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:39.697143", "step": 1178, "epoch": 2 }, { "type": "loss", "content": 0.03191584721207619, "timestamp": "2025-09-30 22:10:39.704771", "step": 1179, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:39.739703", "step": 1179, "epoch": 2 }, { "type": "loss", "content": 0.012460844591259956, "timestamp": "2025-09-30 22:10:39.770718", "step": 1180, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:39.813104", "step": 1180, "epoch": 2 }, { "type": "loss", "content": 0.030424175783991814, "timestamp": "2025-09-30 22:10:39.816536", "step": 1181, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:39.851579", "step": 1181, "epoch": 2 }, { "type": "loss", "content": 0.03738018870353699, "timestamp": "2025-09-30 22:10:39.855520", "step": 1182, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:39.891473", "step": 1182, "epoch": 2 }, { "type": "loss", "content": 0.01585381105542183, "timestamp": "2025-09-30 22:10:39.894459", "step": 1183, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:39.928384", "step": 1183, "epoch": 2 }, { "type": "loss", "content": 0.005708039738237858, "timestamp": "2025-09-30 22:10:39.952491", "step": 1184, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:39.983259", "step": 1184, "epoch": 2 }, { "type": "loss", "content": 0.024410611018538475, "timestamp": "2025-09-30 22:10:39.986286", "step": 1185, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:40.019579", "step": 1185, "epoch": 2 }, { "type": "loss", "content": 0.013256765902042389, "timestamp": "2025-09-30 22:10:40.022452", "step": 1186, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:40.060396", "step": 1186, "epoch": 2 }, { "type": "loss", "content": 0.0037524027284234762, "timestamp": "2025-09-30 22:10:40.063713", "step": 1187, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:40.095383", "step": 1187, "epoch": 2 }, { "type": "loss", "content": 0.013813267461955547, "timestamp": "2025-09-30 22:10:40.119098", "step": 1188, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:40.151785", "step": 1188, "epoch": 2 }, { "type": "loss", "content": 0.008720348589122295, "timestamp": "2025-09-30 22:10:40.154135", "step": 1189, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:40.194273", "step": 1189, "epoch": 2 }, { "type": "loss", "content": 0.03764233738183975, "timestamp": "2025-09-30 22:10:40.197284", "step": 1190, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:40.229457", "step": 1190, "epoch": 2 }, { "type": "loss", "content": 0.008992317132651806, "timestamp": "2025-09-30 22:10:40.231659", "step": 1191, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:40.270538", "step": 1191, "epoch": 2 }, { "type": "loss", "content": 0.022100094705820084, "timestamp": "2025-09-30 22:10:40.294475", "step": 1192, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:40.336864", "step": 1192, "epoch": 2 }, { "type": "loss", "content": 0.014298759400844574, "timestamp": "2025-09-30 22:10:40.339011", "step": 1193, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:40.369955", "step": 1193, "epoch": 2 }, { "type": "loss", "content": 0.0014098555548116565, "timestamp": "2025-09-30 22:10:40.372422", "step": 1194, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:40.405316", "step": 1194, "epoch": 2 }, { "type": "loss", "content": 0.04216473549604416, "timestamp": "2025-09-30 22:10:40.407930", "step": 1195, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:40.444291", "step": 1195, "epoch": 2 }, { "type": "loss", "content": 0.01196068711578846, "timestamp": "2025-09-30 22:10:40.467886", "step": 1196, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:40.502332", "step": 1196, "epoch": 2 }, { "type": "loss", "content": 0.052920062094926834, "timestamp": "2025-09-30 22:10:40.504536", "step": 1197, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:10:41.305801", "step": 1197, "epoch": 2 }, { "type": "pplx", "content": 59698730.953466825, "timestamp": "2025-09-30 22:10:41.308000", "step": 1197, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:41.338068", "step": 1197, "epoch": 2 }, { "type": "loss", "content": 0.008483153767883778, "timestamp": "2025-09-30 22:10:41.340400", "step": 1198, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:41.370878", "step": 1198, "epoch": 2 }, { "type": "loss", "content": 0.014434531331062317, "timestamp": "2025-09-30 22:10:41.372898", "step": 1199, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:41.406684", "step": 1199, "epoch": 2 }, { "type": "loss", "content": 0.024134354665875435, "timestamp": "2025-09-30 22:10:41.430663", "step": 1200, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:41.461622", "step": 1200, "epoch": 2 }, { "type": "loss", "content": 0.04929163679480553, "timestamp": "2025-09-30 22:10:41.463650", "step": 1201, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:41.494393", "step": 1201, "epoch": 2 }, { "type": "loss", "content": 0.003844610182568431, "timestamp": "2025-09-30 22:10:41.496374", "step": 1202, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:41.527507", "step": 1202, "epoch": 2 }, { "type": "loss", "content": 0.002458305796608329, "timestamp": "2025-09-30 22:10:41.529486", "step": 1203, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:10:41.560332", "step": 1203, "epoch": 2 }, { "type": "loss", "content": 0.004756701644510031, "timestamp": "2025-09-30 22:10:41.584038", "step": 1204, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:41.619849", "step": 1204, "epoch": 2 }, { "type": "loss", "content": 0.03479059413075447, "timestamp": "2025-09-30 22:10:41.622446", "step": 1205, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:41.656968", "step": 1205, "epoch": 2 }, { "type": "loss", "content": 0.03355806693434715, "timestamp": "2025-09-30 22:10:41.660136", "step": 1206, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:41.691619", "step": 1206, "epoch": 2 }, { "type": "loss", "content": 0.005543780978769064, "timestamp": "2025-09-30 22:10:41.695076", "step": 1207, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:41.727961", "step": 1207, "epoch": 2 }, { "type": "loss", "content": 0.015537229366600513, "timestamp": "2025-09-30 22:10:41.751862", "step": 1208, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:41.782765", "step": 1208, "epoch": 2 }, { "type": "loss", "content": 0.015018069185316563, "timestamp": "2025-09-30 22:10:41.786493", "step": 1209, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:41.816754", "step": 1209, "epoch": 2 }, { "type": "loss", "content": 0.021458139643073082, "timestamp": "2025-09-30 22:10:41.818847", "step": 1210, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:41.850116", "step": 1210, "epoch": 2 }, { "type": "loss", "content": 0.02499069646000862, "timestamp": "2025-09-30 22:10:41.852460", "step": 1211, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:41.886423", "step": 1211, "epoch": 2 }, { "type": "loss", "content": 0.018161950632929802, "timestamp": "2025-09-30 22:10:41.910214", "step": 1212, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:41.941920", "step": 1212, "epoch": 2 }, { "type": "loss", "content": 0.011252237483859062, "timestamp": "2025-09-30 22:10:41.944264", "step": 1213, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:41.975427", "step": 1213, "epoch": 2 }, { "type": "loss", "content": 0.012444576248526573, "timestamp": "2025-09-30 22:10:41.977920", "step": 1214, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:42.012716", "step": 1214, "epoch": 2 }, { "type": "loss", "content": 0.017678599804639816, "timestamp": "2025-09-30 22:10:42.015014", "step": 1215, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:42.051354", "step": 1215, "epoch": 2 }, { "type": "loss", "content": 0.0251413993537426, "timestamp": "2025-09-30 22:10:42.075701", "step": 1216, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:42.115875", "step": 1216, "epoch": 2 }, { "type": "loss", "content": 0.016345879063010216, "timestamp": "2025-09-30 22:10:42.118125", "step": 1217, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:42.148326", "step": 1217, "epoch": 2 }, { "type": "loss", "content": 0.004925866145640612, "timestamp": "2025-09-30 22:10:42.150898", "step": 1218, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:42.182156", "step": 1218, "epoch": 2 }, { "type": "loss", "content": 0.010515101253986359, "timestamp": "2025-09-30 22:10:42.184457", "step": 1219, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:42.216338", "step": 1219, "epoch": 2 }, { "type": "loss", "content": 0.012195057235658169, "timestamp": "2025-09-30 22:10:42.240431", "step": 1220, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:42.271192", "step": 1220, "epoch": 2 }, { "type": "loss", "content": 0.010490444488823414, "timestamp": "2025-09-30 22:10:42.273297", "step": 1221, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:42.303573", "step": 1221, "epoch": 2 }, { "type": "loss", "content": 0.00895109586417675, "timestamp": "2025-09-30 22:10:42.312752", "step": 1222, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:42.343995", "step": 1222, "epoch": 2 }, { "type": "loss", "content": 0.00880126841366291, "timestamp": "2025-09-30 22:10:42.346324", "step": 1223, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:42.377020", "step": 1223, "epoch": 2 }, { "type": "loss", "content": 0.00783101562410593, "timestamp": "2025-09-30 22:10:42.400496", "step": 1224, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:42.434686", "step": 1224, "epoch": 2 }, { "type": "loss", "content": 0.017458317801356316, "timestamp": "2025-09-30 22:10:42.436971", "step": 1225, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:42.467577", "step": 1225, "epoch": 2 }, { "type": "loss", "content": 0.004970931448042393, "timestamp": "2025-09-30 22:10:42.469901", "step": 1226, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:42.499557", "step": 1226, "epoch": 2 }, { "type": "loss", "content": 0.008084835484623909, "timestamp": "2025-09-30 22:10:42.501741", "step": 1227, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:42.531672", "step": 1227, "epoch": 2 }, { "type": "loss", "content": 0.01121762115508318, "timestamp": "2025-09-30 22:10:42.555461", "step": 1228, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:42.585346", "step": 1228, "epoch": 2 }, { "type": "loss", "content": 0.00550410570576787, "timestamp": "2025-09-30 22:10:42.587240", "step": 1229, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:42.620449", "step": 1229, "epoch": 2 }, { "type": "loss", "content": 0.031140608713030815, "timestamp": "2025-09-30 22:10:42.625597", "step": 1230, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:42.656422", "step": 1230, "epoch": 2 }, { "type": "loss", "content": 0.009130788035690784, "timestamp": "2025-09-30 22:10:42.658985", "step": 1231, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:42.690752", "step": 1231, "epoch": 2 }, { "type": "loss", "content": 0.018223153427243233, "timestamp": "2025-09-30 22:10:42.714475", "step": 1232, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:42.745899", "step": 1232, "epoch": 2 }, { "type": "loss", "content": 0.002786520402878523, "timestamp": "2025-09-30 22:10:42.749478", "step": 1233, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:42.781570", "step": 1233, "epoch": 2 }, { "type": "loss", "content": 0.008198206312954426, "timestamp": "2025-09-30 22:10:42.783841", "step": 1234, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:42.813392", "step": 1234, "epoch": 2 }, { "type": "loss", "content": 0.019522110000252724, "timestamp": "2025-09-30 22:10:42.815411", "step": 1235, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:42.845737", "step": 1235, "epoch": 2 }, { "type": "loss", "content": 0.009420646354556084, "timestamp": "2025-09-30 22:10:42.869305", "step": 1236, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:42.900018", "step": 1236, "epoch": 2 }, { "type": "loss", "content": 0.013096547685563564, "timestamp": "2025-09-30 22:10:42.903026", "step": 1237, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:42.933896", "step": 1237, "epoch": 2 }, { "type": "loss", "content": 0.01162264309823513, "timestamp": "2025-09-30 22:10:42.935993", "step": 1238, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:42.967244", "step": 1238, "epoch": 2 }, { "type": "loss", "content": 0.027476562187075615, "timestamp": "2025-09-30 22:10:42.969326", "step": 1239, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:42.999832", "step": 1239, "epoch": 2 }, { "type": "loss", "content": 0.023600177839398384, "timestamp": "2025-09-30 22:10:43.023629", "step": 1240, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:43.056349", "step": 1240, "epoch": 2 }, { "type": "loss", "content": 0.008278830908238888, "timestamp": "2025-09-30 22:10:43.058887", "step": 1241, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:43.089761", "step": 1241, "epoch": 2 }, { "type": "loss", "content": 0.01494657527655363, "timestamp": "2025-09-30 22:10:43.092061", "step": 1242, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:43.123047", "step": 1242, "epoch": 2 }, { "type": "loss", "content": 0.021537726745009422, "timestamp": "2025-09-30 22:10:43.125760", "step": 1243, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:43.155387", "step": 1243, "epoch": 2 }, { "type": "loss", "content": 0.030687665566802025, "timestamp": "2025-09-30 22:10:43.179595", "step": 1244, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:10:43.211432", "step": 1244, "epoch": 2 }, { "type": "loss", "content": 0.025700166821479797, "timestamp": "2025-09-30 22:10:43.218187", "step": 1245, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:43.254092", "step": 1245, "epoch": 2 }, { "type": "loss", "content": 0.024236468598246574, "timestamp": "2025-09-30 22:10:43.255983", "step": 1246, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:43.286476", "step": 1246, "epoch": 2 }, { "type": "loss", "content": 0.030260343104600906, "timestamp": "2025-09-30 22:10:43.288651", "step": 1247, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:43.320263", "step": 1247, "epoch": 2 }, { "type": "loss", "content": 0.010783525183796883, "timestamp": "2025-09-30 22:10:43.343949", "step": 1248, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:43.374180", "step": 1248, "epoch": 2 }, { "type": "loss", "content": 0.02348669432103634, "timestamp": "2025-09-30 22:10:43.376269", "step": 1249, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:43.406746", "step": 1249, "epoch": 2 }, { "type": "loss", "content": 0.029539231210947037, "timestamp": "2025-09-30 22:10:43.408829", "step": 1250, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:43.445946", "step": 1250, "epoch": 2 }, { "type": "loss", "content": 0.022312434390187263, "timestamp": "2025-09-30 22:10:43.449359", "step": 1251, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:43.481052", "step": 1251, "epoch": 2 }, { "type": "loss", "content": 0.009221461601555347, "timestamp": "2025-09-30 22:10:43.504805", "step": 1252, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:10:43.535884", "step": 1252, "epoch": 2 }, { "type": "loss", "content": 0.0020505916327238083, "timestamp": "2025-09-30 22:10:43.538140", "step": 1253, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:43.570161", "step": 1253, "epoch": 2 }, { "type": "loss", "content": 0.008239880204200745, "timestamp": "2025-09-30 22:10:43.572483", "step": 1254, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:10:44.406119", "step": 1254, "epoch": 2 }, { "type": "pplx", "content": 60760063.312181026, "timestamp": "2025-09-30 22:10:44.408997", "step": 1254, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:44.438030", "step": 1254, "epoch": 2 }, { "type": "loss", "content": 0.005631845910102129, "timestamp": "2025-09-30 22:10:44.441744", "step": 1255, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:44.473214", "step": 1255, "epoch": 2 }, { "type": "loss", "content": 0.022505655884742737, "timestamp": "2025-09-30 22:10:44.501650", "step": 1256, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:44.533046", "step": 1256, "epoch": 2 }, { "type": "loss", "content": 0.01569953002035618, "timestamp": "2025-09-30 22:10:44.535803", "step": 1257, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:44.568670", "step": 1257, "epoch": 2 }, { "type": "loss", "content": 0.023941362276673317, "timestamp": "2025-09-30 22:10:44.573306", "step": 1258, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:44.605137", "step": 1258, "epoch": 2 }, { "type": "loss", "content": 0.007991237565875053, "timestamp": "2025-09-30 22:10:44.607342", "step": 1259, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:44.641178", "step": 1259, "epoch": 2 }, { "type": "loss", "content": 0.01361407432705164, "timestamp": "2025-09-30 22:10:44.665513", "step": 1260, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:44.709863", "step": 1260, "epoch": 2 }, { "type": "loss", "content": 0.05445224419236183, "timestamp": "2025-09-30 22:10:44.712354", "step": 1261, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:44.744995", "step": 1261, "epoch": 2 }, { "type": "loss", "content": 0.010495486669242382, "timestamp": "2025-09-30 22:10:44.747463", "step": 1262, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:44.781899", "step": 1262, "epoch": 2 }, { "type": "loss", "content": 0.018684295937418938, "timestamp": "2025-09-30 22:10:44.784744", "step": 1263, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:44.819523", "step": 1263, "epoch": 2 }, { "type": "loss", "content": 0.016452034935355186, "timestamp": "2025-09-30 22:10:44.843542", "step": 1264, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:44.877758", "step": 1264, "epoch": 2 }, { "type": "loss", "content": 0.0076084258034825325, "timestamp": "2025-09-30 22:10:44.882246", "step": 1265, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:44.916878", "step": 1265, "epoch": 2 }, { "type": "loss", "content": 0.012740017846226692, "timestamp": "2025-09-30 22:10:44.920311", "step": 1266, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:44.956443", "step": 1266, "epoch": 2 }, { "type": "loss", "content": 0.012665815651416779, "timestamp": "2025-09-30 22:10:44.960314", "step": 1267, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:44.996538", "step": 1267, "epoch": 2 }, { "type": "loss", "content": 0.024495363235473633, "timestamp": "2025-09-30 22:10:45.021516", "step": 1268, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:45.060223", "step": 1268, "epoch": 2 }, { "type": "loss", "content": 0.012718215584754944, "timestamp": "2025-09-30 22:10:45.067857", "step": 1269, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:45.102912", "step": 1269, "epoch": 2 }, { "type": "loss", "content": 0.021051540970802307, "timestamp": "2025-09-30 22:10:45.106336", "step": 1270, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:45.140191", "step": 1270, "epoch": 2 }, { "type": "loss", "content": 0.007944755256175995, "timestamp": "2025-09-30 22:10:45.148093", "step": 1271, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:45.182766", "step": 1271, "epoch": 2 }, { "type": "loss", "content": 0.016895083710551262, "timestamp": "2025-09-30 22:10:45.206638", "step": 1272, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:45.244320", "step": 1272, "epoch": 2 }, { "type": "loss", "content": 0.008962833322584629, "timestamp": "2025-09-30 22:10:45.249275", "step": 1273, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:45.281906", "step": 1273, "epoch": 2 }, { "type": "loss", "content": 0.019458699971437454, "timestamp": "2025-09-30 22:10:45.284724", "step": 1274, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:45.317936", "step": 1274, "epoch": 2 }, { "type": "loss", "content": 0.00804336927831173, "timestamp": "2025-09-30 22:10:45.320649", "step": 1275, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:45.358060", "step": 1275, "epoch": 2 }, { "type": "loss", "content": 0.005566149018704891, "timestamp": "2025-09-30 22:10:45.386145", "step": 1276, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:45.423307", "step": 1276, "epoch": 2 }, { "type": "loss", "content": 0.017086612060666084, "timestamp": "2025-09-30 22:10:45.426371", "step": 1277, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:45.459093", "step": 1277, "epoch": 2 }, { "type": "loss", "content": 0.012655085884034634, "timestamp": "2025-09-30 22:10:45.461910", "step": 1278, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:45.497560", "step": 1278, "epoch": 2 }, { "type": "loss", "content": 0.03651884198188782, "timestamp": "2025-09-30 22:10:45.500756", "step": 1279, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:45.533443", "step": 1279, "epoch": 2 }, { "type": "loss", "content": 0.025326358154416084, "timestamp": "2025-09-30 22:10:45.557557", "step": 1280, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:45.590639", "step": 1280, "epoch": 2 }, { "type": "loss", "content": 0.011267936788499355, "timestamp": "2025-09-30 22:10:45.597551", "step": 1281, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:45.639706", "step": 1281, "epoch": 2 }, { "type": "loss", "content": 0.013515127822756767, "timestamp": "2025-09-30 22:10:45.642723", "step": 1282, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:45.675781", "step": 1282, "epoch": 2 }, { "type": "loss", "content": 0.009024390950798988, "timestamp": "2025-09-30 22:10:45.677851", "step": 1283, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:10:45.710782", "step": 1283, "epoch": 2 }, { "type": "loss", "content": 0.007083322387188673, "timestamp": "2025-09-30 22:10:45.737351", "step": 1284, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:45.769479", "step": 1284, "epoch": 2 }, { "type": "loss", "content": 0.023868506774306297, "timestamp": "2025-09-30 22:10:45.772424", "step": 1285, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:45.822934", "step": 1285, "epoch": 2 }, { "type": "loss", "content": 0.004573192447423935, "timestamp": "2025-09-30 22:10:45.831560", "step": 1286, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:45.867270", "step": 1286, "epoch": 2 }, { "type": "loss", "content": 0.011483915150165558, "timestamp": "2025-09-30 22:10:45.874612", "step": 1287, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:45.913935", "step": 1287, "epoch": 2 }, { "type": "loss", "content": 0.0037527200765907764, "timestamp": "2025-09-30 22:10:45.938270", "step": 1288, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:45.974913", "step": 1288, "epoch": 2 }, { "type": "loss", "content": 0.030221473425626755, "timestamp": "2025-09-30 22:10:45.980152", "step": 1289, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:46.016618", "step": 1289, "epoch": 2 }, { "type": "loss", "content": 0.010256985202431679, "timestamp": "2025-09-30 22:10:46.021314", "step": 1290, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:46.077989", "step": 1290, "epoch": 2 }, { "type": "loss", "content": 0.015660548582673073, "timestamp": "2025-09-30 22:10:46.084527", "step": 1291, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:46.128720", "step": 1291, "epoch": 2 }, { "type": "loss", "content": 0.015375054441392422, "timestamp": "2025-09-30 22:10:46.158159", "step": 1292, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:46.203007", "step": 1292, "epoch": 2 }, { "type": "loss", "content": 0.003718825289979577, "timestamp": "2025-09-30 22:10:46.213348", "step": 1293, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:46.253664", "step": 1293, "epoch": 2 }, { "type": "loss", "content": 0.0035421750508248806, "timestamp": "2025-09-30 22:10:46.258505", "step": 1294, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:46.294634", "step": 1294, "epoch": 2 }, { "type": "loss", "content": 0.006106370594352484, "timestamp": "2025-09-30 22:10:46.310314", "step": 1295, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:46.368561", "step": 1295, "epoch": 2 }, { "type": "loss", "content": 0.02283380925655365, "timestamp": "2025-09-30 22:10:46.398307", "step": 1296, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:46.440175", "step": 1296, "epoch": 2 }, { "type": "loss", "content": 0.014118282124400139, "timestamp": "2025-09-30 22:10:46.442889", "step": 1297, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:46.475786", "step": 1297, "epoch": 2 }, { "type": "loss", "content": 0.02216426469385624, "timestamp": "2025-09-30 22:10:46.485250", "step": 1298, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:46.535659", "step": 1298, "epoch": 2 }, { "type": "loss", "content": 0.018284201622009277, "timestamp": "2025-09-30 22:10:46.539908", "step": 1299, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:46.596624", "step": 1299, "epoch": 2 }, { "type": "loss", "content": 0.012446001172065735, "timestamp": "2025-09-30 22:10:46.622156", "step": 1300, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:46.660980", "step": 1300, "epoch": 2 }, { "type": "loss", "content": 0.009262526407837868, "timestamp": "2025-09-30 22:10:46.672054", "step": 1301, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:46.710138", "step": 1301, "epoch": 2 }, { "type": "loss", "content": 0.006927129812538624, "timestamp": "2025-09-30 22:10:46.713504", "step": 1302, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:46.749162", "step": 1302, "epoch": 2 }, { "type": "loss", "content": 0.012467254884541035, "timestamp": "2025-09-30 22:10:46.759740", "step": 1303, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:46.802645", "step": 1303, "epoch": 2 }, { "type": "loss", "content": 0.0042889113537967205, "timestamp": "2025-09-30 22:10:46.829066", "step": 1304, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:46.865477", "step": 1304, "epoch": 2 }, { "type": "loss", "content": 0.009332495741546154, "timestamp": "2025-09-30 22:10:46.868527", "step": 1305, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:46.913940", "step": 1305, "epoch": 2 }, { "type": "loss", "content": 0.0059282719157636166, "timestamp": "2025-09-30 22:10:46.919164", "step": 1306, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:46.956720", "step": 1306, "epoch": 2 }, { "type": "loss", "content": 0.014585415832698345, "timestamp": "2025-09-30 22:10:46.959773", "step": 1307, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:47.003962", "step": 1307, "epoch": 2 }, { "type": "loss", "content": 0.013048151507973671, "timestamp": "2025-09-30 22:10:47.027621", "step": 1308, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:47.065208", "step": 1308, "epoch": 2 }, { "type": "loss", "content": 0.005672777537256479, "timestamp": "2025-09-30 22:10:47.074406", "step": 1309, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:47.122705", "step": 1309, "epoch": 2 }, { "type": "loss", "content": 0.017160657793283463, "timestamp": "2025-09-30 22:10:47.125949", "step": 1310, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:10:47.167950", "step": 1310, "epoch": 2 }, { "type": "loss", "content": 0.023994240909814835, "timestamp": "2025-09-30 22:10:47.173891", "step": 1311, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:10:48.202623", "step": 1311, "epoch": 2 }, { "type": "pplx", "content": 66824242.172381386, "timestamp": "2025-09-30 22:10:48.207345", "step": 1311, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:48.242280", "step": 1311, "epoch": 2 }, { "type": "loss", "content": 0.0038081335369497538, "timestamp": "2025-09-30 22:10:48.267180", "step": 1312, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:48.302871", "step": 1312, "epoch": 2 }, { "type": "loss", "content": 0.004983414430171251, "timestamp": "2025-09-30 22:10:48.305432", "step": 1313, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:48.343167", "step": 1313, "epoch": 2 }, { "type": "loss", "content": 0.0009271339513361454, "timestamp": "2025-09-30 22:10:48.346308", "step": 1314, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:48.382985", "step": 1314, "epoch": 2 }, { "type": "loss", "content": 0.014029703103005886, "timestamp": "2025-09-30 22:10:48.386537", "step": 1315, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:48.422652", "step": 1315, "epoch": 2 }, { "type": "loss", "content": 0.005625714082270861, "timestamp": "2025-09-30 22:10:48.455947", "step": 1316, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:48.491080", "step": 1316, "epoch": 2 }, { "type": "loss", "content": 0.010734512470662594, "timestamp": "2025-09-30 22:10:48.494651", "step": 1317, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:48.536411", "step": 1317, "epoch": 2 }, { "type": "loss", "content": 0.0022723597940057516, "timestamp": "2025-09-30 22:10:48.540395", "step": 1318, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:48.575420", "step": 1318, "epoch": 2 }, { "type": "loss", "content": 0.018299564719200134, "timestamp": "2025-09-30 22:10:48.583003", "step": 1319, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:48.620587", "step": 1319, "epoch": 2 }, { "type": "loss", "content": 0.005241929553449154, "timestamp": "2025-09-30 22:10:48.646522", "step": 1320, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:48.679720", "step": 1320, "epoch": 2 }, { "type": "loss", "content": 0.04034224525094032, "timestamp": "2025-09-30 22:10:48.689463", "step": 1321, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:48.737257", "step": 1321, "epoch": 2 }, { "type": "loss", "content": 0.02214857004582882, "timestamp": "2025-09-30 22:10:48.740982", "step": 1322, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:48.778418", "step": 1322, "epoch": 2 }, { "type": "loss", "content": 0.008105490356683731, "timestamp": "2025-09-30 22:10:48.781765", "step": 1323, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:48.822059", "step": 1323, "epoch": 2 }, { "type": "loss", "content": 0.005707655567675829, "timestamp": "2025-09-30 22:10:48.848712", "step": 1324, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:48.886114", "step": 1324, "epoch": 2 }, { "type": "loss", "content": 0.0042085289023816586, "timestamp": "2025-09-30 22:10:48.891192", "step": 1325, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:48.929306", "step": 1325, "epoch": 2 }, { "type": "loss", "content": 0.0041105556301772594, "timestamp": "2025-09-30 22:10:48.933573", "step": 1326, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:48.968358", "step": 1326, "epoch": 2 }, { "type": "loss", "content": 0.003951088059693575, "timestamp": "2025-09-30 22:10:48.971861", "step": 1327, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:49.012993", "step": 1327, "epoch": 2 }, { "type": "loss", "content": 0.009346050210297108, "timestamp": "2025-09-30 22:10:49.036961", "step": 1328, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:49.072001", "step": 1328, "epoch": 2 }, { "type": "loss", "content": 0.01684584841132164, "timestamp": "2025-09-30 22:10:49.081270", "step": 1329, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:49.123029", "step": 1329, "epoch": 2 }, { "type": "loss", "content": 0.004063504748046398, "timestamp": "2025-09-30 22:10:49.127669", "step": 1330, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:49.172245", "step": 1330, "epoch": 2 }, { "type": "loss", "content": 0.04242516681551933, "timestamp": "2025-09-30 22:10:49.176195", "step": 1331, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:49.217937", "step": 1331, "epoch": 2 }, { "type": "loss", "content": 0.012350103817880154, "timestamp": "2025-09-30 22:10:49.242947", "step": 1332, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:49.283401", "step": 1332, "epoch": 2 }, { "type": "loss", "content": 0.012545420788228512, "timestamp": "2025-09-30 22:10:49.286125", "step": 1333, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:49.334088", "step": 1333, "epoch": 2 }, { "type": "loss", "content": 0.01438872516155243, "timestamp": "2025-09-30 22:10:49.338025", "step": 1334, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:49.384735", "step": 1334, "epoch": 2 }, { "type": "loss", "content": 0.028075823560357094, "timestamp": "2025-09-30 22:10:49.393902", "step": 1335, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:49.431481", "step": 1335, "epoch": 2 }, { "type": "loss", "content": 0.003635299624875188, "timestamp": "2025-09-30 22:10:49.456582", "step": 1336, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:49.490976", "step": 1336, "epoch": 2 }, { "type": "loss", "content": 0.008153866045176983, "timestamp": "2025-09-30 22:10:49.494167", "step": 1337, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:49.532889", "step": 1337, "epoch": 2 }, { "type": "loss", "content": 0.015067125670611858, "timestamp": "2025-09-30 22:10:49.536980", "step": 1338, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:49.570021", "step": 1338, "epoch": 2 }, { "type": "loss", "content": 0.0033039043191820383, "timestamp": "2025-09-30 22:10:49.573742", "step": 1339, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:49.608200", "step": 1339, "epoch": 2 }, { "type": "loss", "content": 0.012563040480017662, "timestamp": "2025-09-30 22:10:49.633472", "step": 1340, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:49.675718", "step": 1340, "epoch": 2 }, { "type": "loss", "content": 0.01576489582657814, "timestamp": "2025-09-30 22:10:49.678356", "step": 1341, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:49.733087", "step": 1341, "epoch": 2 }, { "type": "loss", "content": 0.009753124788403511, "timestamp": "2025-09-30 22:10:49.740931", "step": 1342, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:49.774846", "step": 1342, "epoch": 2 }, { "type": "loss", "content": 0.007622593082487583, "timestamp": "2025-09-30 22:10:49.777507", "step": 1343, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:49.832110", "step": 1343, "epoch": 2 }, { "type": "loss", "content": 0.01353259664028883, "timestamp": "2025-09-30 22:10:49.856790", "step": 1344, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:49.899581", "step": 1344, "epoch": 2 }, { "type": "loss", "content": 0.0013351899106055498, "timestamp": "2025-09-30 22:10:49.901858", "step": 1345, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:49.938395", "step": 1345, "epoch": 2 }, { "type": "loss", "content": 0.01693071238696575, "timestamp": "2025-09-30 22:10:49.946484", "step": 1346, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:49.979673", "step": 1346, "epoch": 2 }, { "type": "loss", "content": 0.013451658189296722, "timestamp": "2025-09-30 22:10:49.986298", "step": 1347, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:50.024003", "step": 1347, "epoch": 2 }, { "type": "loss", "content": 0.001634464249946177, "timestamp": "2025-09-30 22:10:50.048966", "step": 1348, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:50.084320", "step": 1348, "epoch": 2 }, { "type": "loss", "content": 0.0013204816495999694, "timestamp": "2025-09-30 22:10:50.087187", "step": 1349, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:50.123799", "step": 1349, "epoch": 2 }, { "type": "loss", "content": 0.00696495920419693, "timestamp": "2025-09-30 22:10:50.131005", "step": 1350, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:10:50.167568", "step": 1350, "epoch": 2 }, { "type": "loss", "content": 0.007256153970956802, "timestamp": "2025-09-30 22:10:50.176238", "step": 1351, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:50.210832", "step": 1351, "epoch": 2 }, { "type": "loss", "content": 0.0019330900395289063, "timestamp": "2025-09-30 22:10:50.242473", "step": 1352, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:50.279094", "step": 1352, "epoch": 2 }, { "type": "loss", "content": 0.004168027546256781, "timestamp": "2025-09-30 22:10:50.281649", "step": 1353, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:50.328346", "step": 1353, "epoch": 2 }, { "type": "loss", "content": 0.02218509279191494, "timestamp": "2025-09-30 22:10:50.330895", "step": 1354, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:50.371467", "step": 1354, "epoch": 2 }, { "type": "loss", "content": 0.01591980643570423, "timestamp": "2025-09-30 22:10:50.374148", "step": 1355, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:50.416321", "step": 1355, "epoch": 2 }, { "type": "loss", "content": 0.03547831252217293, "timestamp": "2025-09-30 22:10:50.440145", "step": 1356, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:50.490194", "step": 1356, "epoch": 2 }, { "type": "loss", "content": 0.009803591296076775, "timestamp": "2025-09-30 22:10:50.493087", "step": 1357, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:50.537096", "step": 1357, "epoch": 2 }, { "type": "loss", "content": 0.005764176603406668, "timestamp": "2025-09-30 22:10:50.547221", "step": 1358, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:50.592621", "step": 1358, "epoch": 2 }, { "type": "loss", "content": 0.02682340517640114, "timestamp": "2025-09-30 22:10:50.597519", "step": 1359, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:50.634731", "step": 1359, "epoch": 2 }, { "type": "loss", "content": 0.002818641485646367, "timestamp": "2025-09-30 22:10:50.659155", "step": 1360, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:50.694661", "step": 1360, "epoch": 2 }, { "type": "loss", "content": 0.036229345947504044, "timestamp": "2025-09-30 22:10:50.697825", "step": 1361, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:50.732111", "step": 1361, "epoch": 2 }, { "type": "loss", "content": 0.014132755808532238, "timestamp": "2025-09-30 22:10:50.734400", "step": 1362, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:50.768505", "step": 1362, "epoch": 2 }, { "type": "loss", "content": 0.003775624791160226, "timestamp": "2025-09-30 22:10:50.771889", "step": 1363, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:50.808951", "step": 1363, "epoch": 2 }, { "type": "loss", "content": 0.002429934684187174, "timestamp": "2025-09-30 22:10:50.835118", "step": 1364, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:50.868781", "step": 1364, "epoch": 2 }, { "type": "loss", "content": 0.016024207696318626, "timestamp": "2025-09-30 22:10:50.871955", "step": 1365, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:50.918122", "step": 1365, "epoch": 2 }, { "type": "loss", "content": 0.005808067973703146, "timestamp": "2025-09-30 22:10:50.921867", "step": 1366, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:50.954084", "step": 1366, "epoch": 2 }, { "type": "loss", "content": 0.009768741205334663, "timestamp": "2025-09-30 22:10:50.956200", "step": 1367, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:50.988868", "step": 1367, "epoch": 2 }, { "type": "loss", "content": 0.00779004255309701, "timestamp": "2025-09-30 22:10:51.015256", "step": 1368, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:10:51.950880", "step": 1368, "epoch": 2 }, { "type": "pplx", "content": 68275654.32620457, "timestamp": "2025-09-30 22:10:51.956112", "step": 1368, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:51.987256", "step": 1368, "epoch": 2 }, { "type": "loss", "content": 0.0018717555794864893, "timestamp": "2025-09-30 22:10:51.991273", "step": 1369, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:52.027133", "step": 1369, "epoch": 2 }, { "type": "loss", "content": 0.0035747247748076916, "timestamp": "2025-09-30 22:10:52.031292", "step": 1370, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:10:52.083305", "step": 1370, "epoch": 2 }, { "type": "loss", "content": 0.0063920291140675545, "timestamp": "2025-09-30 22:10:52.086659", "step": 1371, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:52.120613", "step": 1371, "epoch": 2 }, { "type": "loss", "content": 0.005606912542134523, "timestamp": "2025-09-30 22:10:52.146591", "step": 1372, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:52.179528", "step": 1372, "epoch": 2 }, { "type": "loss", "content": 0.008563164621591568, "timestamp": "2025-09-30 22:10:52.187159", "step": 1373, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:52.222249", "step": 1373, "epoch": 2 }, { "type": "loss", "content": 0.01809045672416687, "timestamp": "2025-09-30 22:10:52.224578", "step": 1374, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:52.272221", "step": 1374, "epoch": 2 }, { "type": "loss", "content": 0.02882465533912182, "timestamp": "2025-09-30 22:10:52.280493", "step": 1375, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:52.314708", "step": 1375, "epoch": 2 }, { "type": "loss", "content": 0.006099649704992771, "timestamp": "2025-09-30 22:10:52.340026", "step": 1376, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:52.377763", "step": 1376, "epoch": 2 }, { "type": "loss", "content": 0.020366067066788673, "timestamp": "2025-09-30 22:10:52.381414", "step": 1377, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:52.420571", "step": 1377, "epoch": 2 }, { "type": "loss", "content": 0.011057457886636257, "timestamp": "2025-09-30 22:10:52.423638", "step": 1378, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:52.462322", "step": 1378, "epoch": 2 }, { "type": "loss", "content": 0.020656252279877663, "timestamp": "2025-09-30 22:10:52.464958", "step": 1379, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:52.499500", "step": 1379, "epoch": 2 }, { "type": "loss", "content": 0.002947145840153098, "timestamp": "2025-09-30 22:10:52.524874", "step": 1380, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:52.563038", "step": 1380, "epoch": 2 }, { "type": "loss", "content": 0.027359262108802795, "timestamp": "2025-09-30 22:10:52.565331", "step": 1381, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:52.599782", "step": 1381, "epoch": 2 }, { "type": "loss", "content": 0.014876100234687328, "timestamp": "2025-09-30 22:10:52.603080", "step": 1382, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:52.635654", "step": 1382, "epoch": 2 }, { "type": "loss", "content": 0.008522089570760727, "timestamp": "2025-09-30 22:10:52.638261", "step": 1383, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:52.672741", "step": 1383, "epoch": 2 }, { "type": "loss", "content": 0.0012803823919966817, "timestamp": "2025-09-30 22:10:52.697085", "step": 1384, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:52.732716", "step": 1384, "epoch": 2 }, { "type": "loss", "content": 0.01757989265024662, "timestamp": "2025-09-30 22:10:52.736475", "step": 1385, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:52.774988", "step": 1385, "epoch": 2 }, { "type": "loss", "content": 0.025001874193549156, "timestamp": "2025-09-30 22:10:52.783637", "step": 1386, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:52.816538", "step": 1386, "epoch": 2 }, { "type": "loss", "content": 0.031573910266160965, "timestamp": "2025-09-30 22:10:52.820073", "step": 1387, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:52.853933", "step": 1387, "epoch": 2 }, { "type": "loss", "content": 0.0021813209168612957, "timestamp": "2025-09-30 22:10:52.877779", "step": 1388, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:52.920130", "step": 1388, "epoch": 2 }, { "type": "loss", "content": 0.01097036711871624, "timestamp": "2025-09-30 22:10:52.926908", "step": 1389, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:52.966004", "step": 1389, "epoch": 2 }, { "type": "loss", "content": 0.00787901971489191, "timestamp": "2025-09-30 22:10:52.968824", "step": 1390, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:53.012265", "step": 1390, "epoch": 2 }, { "type": "loss", "content": 0.004948236979544163, "timestamp": "2025-09-30 22:10:53.015520", "step": 1391, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:53.048414", "step": 1391, "epoch": 2 }, { "type": "loss", "content": 0.003085443750023842, "timestamp": "2025-09-30 22:10:53.073497", "step": 1392, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:53.109575", "step": 1392, "epoch": 2 }, { "type": "loss", "content": 0.00811847299337387, "timestamp": "2025-09-30 22:10:53.117885", "step": 1393, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:53.158134", "step": 1393, "epoch": 2 }, { "type": "loss", "content": 0.016569631174206734, "timestamp": "2025-09-30 22:10:53.162009", "step": 1394, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:53.204349", "step": 1394, "epoch": 2 }, { "type": "loss", "content": 0.004678026307374239, "timestamp": "2025-09-30 22:10:53.213866", "step": 1395, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:53.260702", "step": 1395, "epoch": 2 }, { "type": "loss", "content": 0.007153411395847797, "timestamp": "2025-09-30 22:10:53.289648", "step": 1396, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:53.324476", "step": 1396, "epoch": 2 }, { "type": "loss", "content": 0.012572051957249641, "timestamp": "2025-09-30 22:10:53.327164", "step": 1397, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:53.361500", "step": 1397, "epoch": 2 }, { "type": "loss", "content": 0.05144510790705681, "timestamp": "2025-09-30 22:10:53.364437", "step": 1398, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:53.402787", "step": 1398, "epoch": 2 }, { "type": "loss", "content": 0.006638978607952595, "timestamp": "2025-09-30 22:10:53.405669", "step": 1399, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:53.441726", "step": 1399, "epoch": 2 }, { "type": "loss", "content": 0.00618391390889883, "timestamp": "2025-09-30 22:10:53.465970", "step": 1400, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:53.497937", "step": 1400, "epoch": 2 }, { "type": "loss", "content": 0.012263186275959015, "timestamp": "2025-09-30 22:10:53.501353", "step": 1401, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:53.543767", "step": 1401, "epoch": 2 }, { "type": "loss", "content": 0.011834419332444668, "timestamp": "2025-09-30 22:10:53.546308", "step": 1402, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:53.580114", "step": 1402, "epoch": 2 }, { "type": "loss", "content": 0.008952699601650238, "timestamp": "2025-09-30 22:10:53.582584", "step": 1403, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:53.621201", "step": 1403, "epoch": 2 }, { "type": "loss", "content": 0.017776764929294586, "timestamp": "2025-09-30 22:10:53.650322", "step": 1404, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:53.696311", "step": 1404, "epoch": 2 }, { "type": "loss", "content": 0.01339375227689743, "timestamp": "2025-09-30 22:10:53.702887", "step": 1405, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:53.738280", "step": 1405, "epoch": 2 }, { "type": "loss", "content": 0.02056540735065937, "timestamp": "2025-09-30 22:10:53.742047", "step": 1406, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:53.777952", "step": 1406, "epoch": 2 }, { "type": "loss", "content": 0.0029556031804531813, "timestamp": "2025-09-30 22:10:53.788392", "step": 1407, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:53.823234", "step": 1407, "epoch": 2 }, { "type": "loss", "content": 0.022973816841840744, "timestamp": "2025-09-30 22:10:53.848310", "step": 1408, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:53.892283", "step": 1408, "epoch": 2 }, { "type": "loss", "content": 0.018975449725985527, "timestamp": "2025-09-30 22:10:53.895142", "step": 1409, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:53.930629", "step": 1409, "epoch": 2 }, { "type": "loss", "content": 0.0014696232974529266, "timestamp": "2025-09-30 22:10:53.942499", "step": 1410, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:53.982401", "step": 1410, "epoch": 2 }, { "type": "loss", "content": 0.013965344987809658, "timestamp": "2025-09-30 22:10:53.985202", "step": 1411, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:54.018772", "step": 1411, "epoch": 2 }, { "type": "loss", "content": 0.003091490129008889, "timestamp": "2025-09-30 22:10:54.043665", "step": 1412, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:54.077265", "step": 1412, "epoch": 2 }, { "type": "loss", "content": 0.003465210786089301, "timestamp": "2025-09-30 22:10:54.080642", "step": 1413, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:54.114294", "step": 1413, "epoch": 2 }, { "type": "loss", "content": 0.0012734970077872276, "timestamp": "2025-09-30 22:10:54.117913", "step": 1414, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:54.155608", "step": 1414, "epoch": 2 }, { "type": "loss", "content": 0.014250697568058968, "timestamp": "2025-09-30 22:10:54.158252", "step": 1415, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:54.191240", "step": 1415, "epoch": 2 }, { "type": "loss", "content": 0.007287542801350355, "timestamp": "2025-09-30 22:10:54.215670", "step": 1416, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:54.250943", "step": 1416, "epoch": 2 }, { "type": "loss", "content": 0.0006267238059081137, "timestamp": "2025-09-30 22:10:54.255574", "step": 1417, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:54.306179", "step": 1417, "epoch": 2 }, { "type": "loss", "content": 0.0023221937008202076, "timestamp": "2025-09-30 22:10:54.314373", "step": 1418, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:54.351413", "step": 1418, "epoch": 2 }, { "type": "loss", "content": 0.003638636786490679, "timestamp": "2025-09-30 22:10:54.355848", "step": 1419, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:54.393505", "step": 1419, "epoch": 2 }, { "type": "loss", "content": 0.006300566252321005, "timestamp": "2025-09-30 22:10:54.431529", "step": 1420, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:54.471257", "step": 1420, "epoch": 2 }, { "type": "loss", "content": 0.008367580361664295, "timestamp": "2025-09-30 22:10:54.477559", "step": 1421, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:54.525967", "step": 1421, "epoch": 2 }, { "type": "loss", "content": 0.0011981696588918567, "timestamp": "2025-09-30 22:10:54.532343", "step": 1422, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:54.570463", "step": 1422, "epoch": 2 }, { "type": "loss", "content": 0.04026754945516586, "timestamp": "2025-09-30 22:10:54.574047", "step": 1423, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:54.625052", "step": 1423, "epoch": 2 }, { "type": "loss", "content": 0.019819127395749092, "timestamp": "2025-09-30 22:10:54.651277", "step": 1424, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:54.695388", "step": 1424, "epoch": 2 }, { "type": "loss", "content": 0.02602306380867958, "timestamp": "2025-09-30 22:10:54.703042", "step": 1425, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:10:55.675171", "step": 1425, "epoch": 2 }, { "type": "pplx", "content": 74128275.07486075, "timestamp": "2025-09-30 22:10:55.679938", "step": 1425, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:55.713635", "step": 1425, "epoch": 2 }, { "type": "loss", "content": 0.0006774549256078899, "timestamp": "2025-09-30 22:10:55.718167", "step": 1426, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:55.757076", "step": 1426, "epoch": 2 }, { "type": "loss", "content": 0.00207727262750268, "timestamp": "2025-09-30 22:10:55.759095", "step": 1427, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:55.793027", "step": 1427, "epoch": 2 }, { "type": "loss", "content": 0.03250009939074516, "timestamp": "2025-09-30 22:10:55.817911", "step": 1428, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:55.856172", "step": 1428, "epoch": 2 }, { "type": "loss", "content": 0.0018771073082461953, "timestamp": "2025-09-30 22:10:55.860216", "step": 1429, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:55.902186", "step": 1429, "epoch": 2 }, { "type": "loss", "content": 0.010951467789709568, "timestamp": "2025-09-30 22:10:55.915378", "step": 1430, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:55.951685", "step": 1430, "epoch": 2 }, { "type": "loss", "content": 0.007400455418974161, "timestamp": "2025-09-30 22:10:55.955913", "step": 1431, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:55.990628", "step": 1431, "epoch": 2 }, { "type": "loss", "content": 0.004835831932723522, "timestamp": "2025-09-30 22:10:56.015274", "step": 1432, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:56.050286", "step": 1432, "epoch": 2 }, { "type": "loss", "content": 0.03025592677295208, "timestamp": "2025-09-30 22:10:56.054517", "step": 1433, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:56.095446", "step": 1433, "epoch": 2 }, { "type": "loss", "content": 0.01778782345354557, "timestamp": "2025-09-30 22:10:56.099054", "step": 1434, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:56.133860", "step": 1434, "epoch": 2 }, { "type": "loss", "content": 0.007224774919450283, "timestamp": "2025-09-30 22:10:56.138054", "step": 1435, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:56.181078", "step": 1435, "epoch": 2 }, { "type": "loss", "content": 0.0025212056934833527, "timestamp": "2025-09-30 22:10:56.207729", "step": 1436, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:56.241100", "step": 1436, "epoch": 2 }, { "type": "loss", "content": 0.016180910170078278, "timestamp": "2025-09-30 22:10:56.243431", "step": 1437, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:56.277385", "step": 1437, "epoch": 2 }, { "type": "loss", "content": 0.0036104407627135515, "timestamp": "2025-09-30 22:10:56.280171", "step": 1438, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:56.317387", "step": 1438, "epoch": 2 }, { "type": "loss", "content": 0.0056266081519424915, "timestamp": "2025-09-30 22:10:56.320965", "step": 1439, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:56.363989", "step": 1439, "epoch": 2 }, { "type": "loss", "content": 0.001078564440831542, "timestamp": "2025-09-30 22:10:56.389753", "step": 1440, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:56.430915", "step": 1440, "epoch": 2 }, { "type": "loss", "content": 0.002247942378744483, "timestamp": "2025-09-30 22:10:56.434260", "step": 1441, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:56.468405", "step": 1441, "epoch": 2 }, { "type": "loss", "content": 0.0018794239731505513, "timestamp": "2025-09-30 22:10:56.471789", "step": 1442, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:56.509237", "step": 1442, "epoch": 2 }, { "type": "loss", "content": 0.011743737384676933, "timestamp": "2025-09-30 22:10:56.512522", "step": 1443, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:56.567355", "step": 1443, "epoch": 2 }, { "type": "loss", "content": 0.001445922302082181, "timestamp": "2025-09-30 22:10:56.591852", "step": 1444, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:56.636602", "step": 1444, "epoch": 2 }, { "type": "loss", "content": 0.0063680075109004974, "timestamp": "2025-09-30 22:10:56.640191", "step": 1445, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:56.673782", "step": 1445, "epoch": 2 }, { "type": "loss", "content": 0.005665498320013285, "timestamp": "2025-09-30 22:10:56.676828", "step": 1446, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:56.717952", "step": 1446, "epoch": 2 }, { "type": "loss", "content": 0.005664634983986616, "timestamp": "2025-09-30 22:10:56.721479", "step": 1447, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:56.762062", "step": 1447, "epoch": 2 }, { "type": "loss", "content": 0.006698861718177795, "timestamp": "2025-09-30 22:10:56.800454", "step": 1448, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:56.838238", "step": 1448, "epoch": 2 }, { "type": "loss", "content": 0.018983641639351845, "timestamp": "2025-09-30 22:10:56.843558", "step": 1449, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:56.896188", "step": 1449, "epoch": 2 }, { "type": "loss", "content": 0.0048040165565907955, "timestamp": "2025-09-30 22:10:56.899361", "step": 1450, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:56.949274", "step": 1450, "epoch": 2 }, { "type": "loss", "content": 0.012130377814173698, "timestamp": "2025-09-30 22:10:56.952827", "step": 1451, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:57.004081", "step": 1451, "epoch": 2 }, { "type": "loss", "content": 0.0032708682119846344, "timestamp": "2025-09-30 22:10:57.030876", "step": 1452, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:57.074324", "step": 1452, "epoch": 2 }, { "type": "loss", "content": 0.0012729617301374674, "timestamp": "2025-09-30 22:10:57.079911", "step": 1453, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:57.116884", "step": 1453, "epoch": 2 }, { "type": "loss", "content": 0.010357841849327087, "timestamp": "2025-09-30 22:10:57.121268", "step": 1454, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:57.158876", "step": 1454, "epoch": 2 }, { "type": "loss", "content": 0.017564745619893074, "timestamp": "2025-09-30 22:10:57.166280", "step": 1455, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:57.203792", "step": 1455, "epoch": 2 }, { "type": "loss", "content": 0.018127692863345146, "timestamp": "2025-09-30 22:10:57.229463", "step": 1456, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:57.274613", "step": 1456, "epoch": 2 }, { "type": "loss", "content": 0.004254224244505167, "timestamp": "2025-09-30 22:10:57.277492", "step": 1457, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:57.325013", "step": 1457, "epoch": 2 }, { "type": "loss", "content": 0.011064024642109871, "timestamp": "2025-09-30 22:10:57.327995", "step": 1458, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:57.364112", "step": 1458, "epoch": 2 }, { "type": "loss", "content": 0.016593145206570625, "timestamp": "2025-09-30 22:10:57.374390", "step": 1459, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:57.425112", "step": 1459, "epoch": 2 }, { "type": "loss", "content": 0.0010914544109255075, "timestamp": "2025-09-30 22:10:57.458658", "step": 1460, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:57.502887", "step": 1460, "epoch": 2 }, { "type": "loss", "content": 0.049089811742305756, "timestamp": "2025-09-30 22:10:57.507490", "step": 1461, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:57.548571", "step": 1461, "epoch": 2 }, { "type": "loss", "content": 0.014249107800424099, "timestamp": "2025-09-30 22:10:57.552909", "step": 1462, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:57.589106", "step": 1462, "epoch": 2 }, { "type": "loss", "content": 0.018402233719825745, "timestamp": "2025-09-30 22:10:57.593388", "step": 1463, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:57.627742", "step": 1463, "epoch": 2 }, { "type": "loss", "content": 0.023724442347884178, "timestamp": "2025-09-30 22:10:57.653649", "step": 1464, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:57.688946", "step": 1464, "epoch": 2 }, { "type": "loss", "content": 0.011793600395321846, "timestamp": "2025-09-30 22:10:57.694266", "step": 1465, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:57.736332", "step": 1465, "epoch": 2 }, { "type": "loss", "content": 0.025359628722071648, "timestamp": "2025-09-30 22:10:57.740096", "step": 1466, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:57.775963", "step": 1466, "epoch": 2 }, { "type": "loss", "content": 0.036889225244522095, "timestamp": "2025-09-30 22:10:57.780574", "step": 1467, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:57.818671", "step": 1467, "epoch": 2 }, { "type": "loss", "content": 0.012519911862909794, "timestamp": "2025-09-30 22:10:57.843725", "step": 1468, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:57.877286", "step": 1468, "epoch": 2 }, { "type": "loss", "content": 0.009341800585389137, "timestamp": "2025-09-30 22:10:57.885916", "step": 1469, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:57.922661", "step": 1469, "epoch": 2 }, { "type": "loss", "content": 0.030676299706101418, "timestamp": "2025-09-30 22:10:57.925898", "step": 1470, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:57.960961", "step": 1470, "epoch": 2 }, { "type": "loss", "content": 0.02009420283138752, "timestamp": "2025-09-30 22:10:57.966077", "step": 1471, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:58.001792", "step": 1471, "epoch": 2 }, { "type": "loss", "content": 0.010448752902448177, "timestamp": "2025-09-30 22:10:58.027635", "step": 1472, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:58.069987", "step": 1472, "epoch": 2 }, { "type": "loss", "content": 0.0042487201280891895, "timestamp": "2025-09-30 22:10:58.075419", "step": 1473, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:58.109681", "step": 1473, "epoch": 2 }, { "type": "loss", "content": 0.006884237285703421, "timestamp": "2025-09-30 22:10:58.113516", "step": 1474, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:58.155343", "step": 1474, "epoch": 2 }, { "type": "loss", "content": 0.016830969601869583, "timestamp": "2025-09-30 22:10:58.158537", "step": 1475, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:58.194099", "step": 1475, "epoch": 2 }, { "type": "loss", "content": 0.013651719316840172, "timestamp": "2025-09-30 22:10:58.230067", "step": 1476, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:10:58.276300", "step": 1476, "epoch": 2 }, { "type": "loss", "content": 0.0026586789172142744, "timestamp": "2025-09-30 22:10:58.284168", "step": 1477, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:58.319931", "step": 1477, "epoch": 2 }, { "type": "loss", "content": 0.010279414243996143, "timestamp": "2025-09-30 22:10:58.322821", "step": 1478, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:58.356679", "step": 1478, "epoch": 2 }, { "type": "loss", "content": 0.020368138328194618, "timestamp": "2025-09-30 22:10:58.360343", "step": 1479, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:58.397661", "step": 1479, "epoch": 2 }, { "type": "loss", "content": 0.008841685950756073, "timestamp": "2025-09-30 22:10:58.423122", "step": 1480, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:58.456754", "step": 1480, "epoch": 2 }, { "type": "loss", "content": 0.01653965562582016, "timestamp": "2025-09-30 22:10:58.460684", "step": 1481, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:10:58.493653", "step": 1481, "epoch": 2 }, { "type": "loss", "content": 0.013136975467205048, "timestamp": "2025-09-30 22:10:58.496941", "step": 1482, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:10:59.623423", "step": 1482, "epoch": 2 }, { "type": "pplx", "content": 73202307.14203984, "timestamp": "2025-09-30 22:10:59.628171", "step": 1482, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:59.667112", "step": 1482, "epoch": 2 }, { "type": "loss", "content": 0.02235250174999237, "timestamp": "2025-09-30 22:10:59.669992", "step": 1483, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:59.704190", "step": 1483, "epoch": 2 }, { "type": "loss", "content": 0.004239015281200409, "timestamp": "2025-09-30 22:10:59.730320", "step": 1484, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:10:59.766635", "step": 1484, "epoch": 2 }, { "type": "loss", "content": 0.027366241440176964, "timestamp": "2025-09-30 22:10:59.773771", "step": 1485, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:59.811244", "step": 1485, "epoch": 2 }, { "type": "loss", "content": 0.010933603160083294, "timestamp": "2025-09-30 22:10:59.822439", "step": 1486, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:59.856790", "step": 1486, "epoch": 2 }, { "type": "loss", "content": 0.003180326195433736, "timestamp": "2025-09-30 22:10:59.859920", "step": 1487, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:59.894794", "step": 1487, "epoch": 2 }, { "type": "loss", "content": 0.00700030755251646, "timestamp": "2025-09-30 22:10:59.928974", "step": 1488, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:10:59.972007", "step": 1488, "epoch": 2 }, { "type": "loss", "content": 0.010965103283524513, "timestamp": "2025-09-30 22:10:59.976528", "step": 1489, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:00.019635", "step": 1489, "epoch": 2 }, { "type": "loss", "content": 0.022053999826312065, "timestamp": "2025-09-30 22:11:00.023580", "step": 1490, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:00.065304", "step": 1490, "epoch": 2 }, { "type": "loss", "content": 0.019200291484594345, "timestamp": "2025-09-30 22:11:00.067944", "step": 1491, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:00.102150", "step": 1491, "epoch": 2 }, { "type": "loss", "content": 0.009958348236978054, "timestamp": "2025-09-30 22:11:00.127080", "step": 1492, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:00.159252", "step": 1492, "epoch": 2 }, { "type": "loss", "content": 0.006062771193683147, "timestamp": "2025-09-30 22:11:00.162722", "step": 1493, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:00.205627", "step": 1493, "epoch": 2 }, { "type": "loss", "content": 0.013420102186501026, "timestamp": "2025-09-30 22:11:00.209482", "step": 1494, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:11:00.251997", "step": 1494, "epoch": 2 }, { "type": "loss", "content": 0.03325999528169632, "timestamp": "2025-09-30 22:11:00.254501", "step": 1495, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:00.289027", "step": 1495, "epoch": 2 }, { "type": "loss", "content": 0.015703823417425156, "timestamp": "2025-09-30 22:11:00.314585", "step": 1496, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:00.356126", "step": 1496, "epoch": 2 }, { "type": "loss", "content": 0.008002055808901787, "timestamp": "2025-09-30 22:11:00.365368", "step": 1497, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:00.408875", "step": 1497, "epoch": 2 }, { "type": "loss", "content": 0.007439378648996353, "timestamp": "2025-09-30 22:11:00.412191", "step": 1498, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:00.445336", "step": 1498, "epoch": 2 }, { "type": "loss", "content": 0.014068402349948883, "timestamp": "2025-09-30 22:11:00.449638", "step": 1499, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:00.483663", "step": 1499, "epoch": 2 }, { "type": "loss", "content": 0.017416151240468025, "timestamp": "2025-09-30 22:11:00.509566", "step": 1500, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1500", "timestamp": "2025-09-30 22:11:07.564602", "step": 1500, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:07.602746", "step": 1500, "epoch": 2 }, { "type": "loss", "content": 0.009199053980410099, "timestamp": "2025-09-30 22:11:07.606406", "step": 1501, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:07.649605", "step": 1501, "epoch": 2 }, { "type": "loss", "content": 0.007240265142172575, "timestamp": "2025-09-30 22:11:07.659750", "step": 1502, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:07.698385", "step": 1502, "epoch": 2 }, { "type": "loss", "content": 0.007249581627547741, "timestamp": "2025-09-30 22:11:07.703753", "step": 1503, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:07.739018", "step": 1503, "epoch": 2 }, { "type": "loss", "content": 0.009428039193153381, "timestamp": "2025-09-30 22:11:07.765062", "step": 1504, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:07.802485", "step": 1504, "epoch": 2 }, { "type": "loss", "content": 0.026344623416662216, "timestamp": "2025-09-30 22:11:07.806883", "step": 1505, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:07.841395", "step": 1505, "epoch": 2 }, { "type": "loss", "content": 0.02475654147565365, "timestamp": "2025-09-30 22:11:07.844454", "step": 1506, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:07.879920", "step": 1506, "epoch": 2 }, { "type": "loss", "content": 0.0358511246740818, "timestamp": "2025-09-30 22:11:07.884344", "step": 1507, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:11:07.924027", "step": 1507, "epoch": 2 }, { "type": "loss", "content": 0.014702022075653076, "timestamp": "2025-09-30 22:11:07.953109", "step": 1508, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:07.989471", "step": 1508, "epoch": 2 }, { "type": "loss", "content": 0.006700622383505106, "timestamp": "2025-09-30 22:11:07.992665", "step": 1509, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:08.030146", "step": 1509, "epoch": 2 }, { "type": "loss", "content": 0.023523863404989243, "timestamp": "2025-09-30 22:11:08.034068", "step": 1510, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:08.068480", "step": 1510, "epoch": 2 }, { "type": "loss", "content": 0.0009568893583491445, "timestamp": "2025-09-30 22:11:08.071191", "step": 1511, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:08.106603", "step": 1511, "epoch": 2 }, { "type": "loss", "content": 0.014375868253409863, "timestamp": "2025-09-30 22:11:08.135573", "step": 1512, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:08.180939", "step": 1512, "epoch": 2 }, { "type": "loss", "content": 0.018592536449432373, "timestamp": "2025-09-30 22:11:08.189260", "step": 1513, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:08.226936", "step": 1513, "epoch": 2 }, { "type": "loss", "content": 0.02132156677544117, "timestamp": "2025-09-30 22:11:08.230631", "step": 1514, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:08.267211", "step": 1514, "epoch": 2 }, { "type": "loss", "content": 0.008908395655453205, "timestamp": "2025-09-30 22:11:08.271990", "step": 1515, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:08.304478", "step": 1515, "epoch": 2 }, { "type": "loss", "content": 0.06693150103092194, "timestamp": "2025-09-30 22:11:08.329790", "step": 1516, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:08.366876", "step": 1516, "epoch": 2 }, { "type": "loss", "content": 0.005975159350782633, "timestamp": "2025-09-30 22:11:08.376887", "step": 1517, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:08.415450", "step": 1517, "epoch": 2 }, { "type": "loss", "content": 0.02259109355509281, "timestamp": "2025-09-30 22:11:08.419032", "step": 1518, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:08.458249", "step": 1518, "epoch": 2 }, { "type": "loss", "content": 0.011762427166104317, "timestamp": "2025-09-30 22:11:08.461973", "step": 1519, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:08.496108", "step": 1519, "epoch": 2 }, { "type": "loss", "content": 0.0023320745676755905, "timestamp": "2025-09-30 22:11:08.520023", "step": 1520, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:08.553310", "step": 1520, "epoch": 2 }, { "type": "loss", "content": 0.011356505565345287, "timestamp": "2025-09-30 22:11:08.556226", "step": 1521, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:08.599418", "step": 1521, "epoch": 2 }, { "type": "loss", "content": 0.007964258082211018, "timestamp": "2025-09-30 22:11:08.603768", "step": 1522, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:08.639326", "step": 1522, "epoch": 2 }, { "type": "loss", "content": 0.009889090433716774, "timestamp": "2025-09-30 22:11:08.646238", "step": 1523, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:08.677783", "step": 1523, "epoch": 2 }, { "type": "loss", "content": 0.012808294966816902, "timestamp": "2025-09-30 22:11:08.703500", "step": 1524, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:08.738573", "step": 1524, "epoch": 2 }, { "type": "loss", "content": 0.018992161378264427, "timestamp": "2025-09-30 22:11:08.741983", "step": 1525, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:08.775922", "step": 1525, "epoch": 2 }, { "type": "loss", "content": 0.014097621664404869, "timestamp": "2025-09-30 22:11:08.779284", "step": 1526, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:08.821142", "step": 1526, "epoch": 2 }, { "type": "loss", "content": 0.014756182208657265, "timestamp": "2025-09-30 22:11:08.825175", "step": 1527, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:08.859641", "step": 1527, "epoch": 2 }, { "type": "loss", "content": 0.023549159988760948, "timestamp": "2025-09-30 22:11:08.888787", "step": 1528, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:08.927774", "step": 1528, "epoch": 2 }, { "type": "loss", "content": 0.030239734798669815, "timestamp": "2025-09-30 22:11:08.930176", "step": 1529, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:08.963505", "step": 1529, "epoch": 2 }, { "type": "loss", "content": 0.03541674092411995, "timestamp": "2025-09-30 22:11:08.966184", "step": 1530, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:09.000340", "step": 1530, "epoch": 2 }, { "type": "loss", "content": 0.009105728007853031, "timestamp": "2025-09-30 22:11:09.011528", "step": 1531, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:09.055899", "step": 1531, "epoch": 2 }, { "type": "loss", "content": 0.02031215839087963, "timestamp": "2025-09-30 22:11:09.080691", "step": 1532, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:09.115100", "step": 1532, "epoch": 2 }, { "type": "loss", "content": 0.01983821578323841, "timestamp": "2025-09-30 22:11:09.117831", "step": 1533, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:09.150292", "step": 1533, "epoch": 2 }, { "type": "loss", "content": 0.0378604419529438, "timestamp": "2025-09-30 22:11:09.152582", "step": 1534, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:09.192938", "step": 1534, "epoch": 2 }, { "type": "loss", "content": 0.02909739501774311, "timestamp": "2025-09-30 22:11:09.196176", "step": 1535, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:09.229038", "step": 1535, "epoch": 2 }, { "type": "loss", "content": 0.014408699236810207, "timestamp": "2025-09-30 22:11:09.252833", "step": 1536, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:09.284511", "step": 1536, "epoch": 2 }, { "type": "loss", "content": 0.0065101939253509045, "timestamp": "2025-09-30 22:11:09.286804", "step": 1537, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:09.317880", "step": 1537, "epoch": 2 }, { "type": "loss", "content": 0.03613752871751785, "timestamp": "2025-09-30 22:11:09.320205", "step": 1538, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:09.352637", "step": 1538, "epoch": 2 }, { "type": "loss", "content": 0.0036878264509141445, "timestamp": "2025-09-30 22:11:09.355902", "step": 1539, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:11:10.286447", "step": 1539, "epoch": 2 }, { "type": "pplx", "content": 62936644.35703027, "timestamp": "2025-09-30 22:11:10.289514", "step": 1539, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:10.319946", "step": 1539, "epoch": 2 }, { "type": "loss", "content": 0.011266936548054218, "timestamp": "2025-09-30 22:11:10.343940", "step": 1540, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:10.379171", "step": 1540, "epoch": 2 }, { "type": "loss", "content": 0.026096221059560776, "timestamp": "2025-09-30 22:11:10.382644", "step": 1541, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:10.417284", "step": 1541, "epoch": 2 }, { "type": "loss", "content": 0.01162233017385006, "timestamp": "2025-09-30 22:11:10.421008", "step": 1542, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:10.453427", "step": 1542, "epoch": 2 }, { "type": "loss", "content": 0.001982102869078517, "timestamp": "2025-09-30 22:11:10.459918", "step": 1543, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:10.493221", "step": 1543, "epoch": 2 }, { "type": "loss", "content": 0.0030516337137669325, "timestamp": "2025-09-30 22:11:10.517291", "step": 1544, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:10.552001", "step": 1544, "epoch": 2 }, { "type": "loss", "content": 0.003602204378694296, "timestamp": "2025-09-30 22:11:10.555139", "step": 1545, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:10.597185", "step": 1545, "epoch": 2 }, { "type": "loss", "content": 0.05414646863937378, "timestamp": "2025-09-30 22:11:10.601437", "step": 1546, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:10.642372", "step": 1546, "epoch": 2 }, { "type": "loss", "content": 0.04593129828572273, "timestamp": "2025-09-30 22:11:10.644436", "step": 1547, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:10.678240", "step": 1547, "epoch": 2 }, { "type": "loss", "content": 0.005031104665249586, "timestamp": "2025-09-30 22:11:10.703129", "step": 1548, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:10.739465", "step": 1548, "epoch": 2 }, { "type": "loss", "content": 0.001814266317524016, "timestamp": "2025-09-30 22:11:10.742847", "step": 1549, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:10.781075", "step": 1549, "epoch": 2 }, { "type": "loss", "content": 0.0005496439407579601, "timestamp": "2025-09-30 22:11:10.784690", "step": 1550, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:10.838220", "step": 1550, "epoch": 2 }, { "type": "loss", "content": 0.009547079913318157, "timestamp": "2025-09-30 22:11:10.840886", "step": 1551, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:10.876943", "step": 1551, "epoch": 2 }, { "type": "loss", "content": 0.031712211668491364, "timestamp": "2025-09-30 22:11:10.902334", "step": 1552, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:10.951313", "step": 1552, "epoch": 2 }, { "type": "loss", "content": 0.00807731319218874, "timestamp": "2025-09-30 22:11:10.954474", "step": 1553, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:10.992470", "step": 1553, "epoch": 2 }, { "type": "loss", "content": 0.00021573407866526395, "timestamp": "2025-09-30 22:11:10.995339", "step": 1554, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:11.027708", "step": 1554, "epoch": 2 }, { "type": "loss", "content": 0.043561168015003204, "timestamp": "2025-09-30 22:11:11.030173", "step": 1555, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:11.063479", "step": 1555, "epoch": 2 }, { "type": "loss", "content": 0.03427037596702576, "timestamp": "2025-09-30 22:11:11.092796", "step": 1556, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:11.125340", "step": 1556, "epoch": 2 }, { "type": "loss", "content": 0.0011238664155825973, "timestamp": "2025-09-30 22:11:11.133735", "step": 1557, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:11.167435", "step": 1557, "epoch": 2 }, { "type": "loss", "content": 0.005172597710043192, "timestamp": "2025-09-30 22:11:11.170400", "step": 1558, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:11.208142", "step": 1558, "epoch": 2 }, { "type": "loss", "content": 0.007295799907296896, "timestamp": "2025-09-30 22:11:11.210944", "step": 1559, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:11.248023", "step": 1559, "epoch": 2 }, { "type": "loss", "content": 0.0017937627853825688, "timestamp": "2025-09-30 22:11:11.273349", "step": 1560, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:11.307673", "step": 1560, "epoch": 2 }, { "type": "loss", "content": 0.026663122698664665, "timestamp": "2025-09-30 22:11:11.310766", "step": 1561, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:11.343619", "step": 1561, "epoch": 2 }, { "type": "loss", "content": 0.00295668700709939, "timestamp": "2025-09-30 22:11:11.346685", "step": 1562, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:11.379192", "step": 1562, "epoch": 2 }, { "type": "loss", "content": 0.01714220829308033, "timestamp": "2025-09-30 22:11:11.385761", "step": 1563, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:11.423772", "step": 1563, "epoch": 2 }, { "type": "loss", "content": 0.009915829636156559, "timestamp": "2025-09-30 22:11:11.447542", "step": 1564, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:11.479176", "step": 1564, "epoch": 2 }, { "type": "loss", "content": 0.00681846309453249, "timestamp": "2025-09-30 22:11:11.481540", "step": 1565, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:11.512410", "step": 1565, "epoch": 2 }, { "type": "loss", "content": 0.006417275872081518, "timestamp": "2025-09-30 22:11:11.514536", "step": 1566, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:11.558353", "step": 1566, "epoch": 2 }, { "type": "loss", "content": 0.0019149180734530091, "timestamp": "2025-09-30 22:11:11.560817", "step": 1567, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:11.598740", "step": 1567, "epoch": 2 }, { "type": "loss", "content": 0.01795736886560917, "timestamp": "2025-09-30 22:11:11.635532", "step": 1568, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:11.676499", "step": 1568, "epoch": 2 }, { "type": "loss", "content": 0.006271344609558582, "timestamp": "2025-09-30 22:11:11.679602", "step": 1569, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:11.718807", "step": 1569, "epoch": 2 }, { "type": "loss", "content": 0.011765302158892155, "timestamp": "2025-09-30 22:11:11.722056", "step": 1570, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:11.758770", "step": 1570, "epoch": 2 }, { "type": "loss", "content": 0.01601121947169304, "timestamp": "2025-09-30 22:11:11.762378", "step": 1571, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:11.799696", "step": 1571, "epoch": 2 }, { "type": "loss", "content": 0.005938879679888487, "timestamp": "2025-09-30 22:11:11.833897", "step": 1572, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:11.881204", "step": 1572, "epoch": 2 }, { "type": "loss", "content": 0.0033138389699161053, "timestamp": "2025-09-30 22:11:11.884875", "step": 1573, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:11.922584", "step": 1573, "epoch": 2 }, { "type": "loss", "content": 0.0007845753571018577, "timestamp": "2025-09-30 22:11:11.925965", "step": 1574, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:11.962022", "step": 1574, "epoch": 2 }, { "type": "loss", "content": 0.012685197405517101, "timestamp": "2025-09-30 22:11:11.965371", "step": 1575, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:11.998559", "step": 1575, "epoch": 2 }, { "type": "loss", "content": 0.002849193289875984, "timestamp": "2025-09-30 22:11:12.023634", "step": 1576, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:12.058480", "step": 1576, "epoch": 2 }, { "type": "loss", "content": 0.008535566739737988, "timestamp": "2025-09-30 22:11:12.067109", "step": 1577, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:12.113632", "step": 1577, "epoch": 2 }, { "type": "loss", "content": 0.019363412633538246, "timestamp": "2025-09-30 22:11:12.116527", "step": 1578, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:12.156439", "step": 1578, "epoch": 2 }, { "type": "loss", "content": 0.0015924114268273115, "timestamp": "2025-09-30 22:11:12.159202", "step": 1579, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:12.199275", "step": 1579, "epoch": 2 }, { "type": "loss", "content": 0.00825516413897276, "timestamp": "2025-09-30 22:11:12.224151", "step": 1580, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:12.258110", "step": 1580, "epoch": 2 }, { "type": "loss", "content": 0.01482932548969984, "timestamp": "2025-09-30 22:11:12.261429", "step": 1581, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:12.301157", "step": 1581, "epoch": 2 }, { "type": "loss", "content": 0.004535818938165903, "timestamp": "2025-09-30 22:11:12.303799", "step": 1582, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:12.335762", "step": 1582, "epoch": 2 }, { "type": "loss", "content": 0.007813365198671818, "timestamp": "2025-09-30 22:11:12.339349", "step": 1583, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:12.372781", "step": 1583, "epoch": 2 }, { "type": "loss", "content": 0.011519812047481537, "timestamp": "2025-09-30 22:11:12.397241", "step": 1584, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:12.437885", "step": 1584, "epoch": 2 }, { "type": "loss", "content": 0.011441444046795368, "timestamp": "2025-09-30 22:11:12.446962", "step": 1585, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:12.484675", "step": 1585, "epoch": 2 }, { "type": "loss", "content": 0.02742001973092556, "timestamp": "2025-09-30 22:11:12.488882", "step": 1586, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:12.535036", "step": 1586, "epoch": 2 }, { "type": "loss", "content": 0.04134129732847214, "timestamp": "2025-09-30 22:11:12.538544", "step": 1587, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:12.586707", "step": 1587, "epoch": 2 }, { "type": "loss", "content": 0.0030712636653333902, "timestamp": "2025-09-30 22:11:12.611174", "step": 1588, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:12.655183", "step": 1588, "epoch": 2 }, { "type": "loss", "content": 0.009427117183804512, "timestamp": "2025-09-30 22:11:12.669240", "step": 1589, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:12.711615", "step": 1589, "epoch": 2 }, { "type": "loss", "content": 0.020526956766843796, "timestamp": "2025-09-30 22:11:12.714242", "step": 1590, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:12.751598", "step": 1590, "epoch": 2 }, { "type": "loss", "content": 0.03806446120142937, "timestamp": "2025-09-30 22:11:12.755662", "step": 1591, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:12.793941", "step": 1591, "epoch": 2 }, { "type": "loss", "content": 0.060422275215387344, "timestamp": "2025-09-30 22:11:12.817908", "step": 1592, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:12.861960", "step": 1592, "epoch": 2 }, { "type": "loss", "content": 0.008056329563260078, "timestamp": "2025-09-30 22:11:12.865140", "step": 1593, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:12.912359", "step": 1593, "epoch": 2 }, { "type": "loss", "content": 0.007798563688993454, "timestamp": "2025-09-30 22:11:12.916058", "step": 1594, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:12.960644", "step": 1594, "epoch": 2 }, { "type": "loss", "content": 0.012549254111945629, "timestamp": "2025-09-30 22:11:12.963167", "step": 1595, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:13.006227", "step": 1595, "epoch": 2 }, { "type": "loss", "content": 0.019497841596603394, "timestamp": "2025-09-30 22:11:13.030477", "step": 1596, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:11:14.051495", "step": 1596, "epoch": 2 }, { "type": "pplx", "content": 47136353.19913118, "timestamp": "2025-09-30 22:11:14.055979", "step": 1596, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:14.085866", "step": 1596, "epoch": 2 }, { "type": "loss", "content": 0.006792944855988026, "timestamp": "2025-09-30 22:11:14.094562", "step": 1597, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:14.127910", "step": 1597, "epoch": 2 }, { "type": "loss", "content": 0.007558739744126797, "timestamp": "2025-09-30 22:11:14.131625", "step": 1598, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:14.168968", "step": 1598, "epoch": 2 }, { "type": "loss", "content": 0.01721787638962269, "timestamp": "2025-09-30 22:11:14.172328", "step": 1599, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:14.225494", "step": 1599, "epoch": 2 }, { "type": "loss", "content": 0.009689075872302055, "timestamp": "2025-09-30 22:11:14.251163", "step": 1600, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:14.285090", "step": 1600, "epoch": 2 }, { "type": "loss", "content": 0.0035173031501471996, "timestamp": "2025-09-30 22:11:14.292633", "step": 1601, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:14.327312", "step": 1601, "epoch": 2 }, { "type": "loss", "content": 0.026356985792517662, "timestamp": "2025-09-30 22:11:14.330429", "step": 1602, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:14.368250", "step": 1602, "epoch": 2 }, { "type": "loss", "content": 0.018740003928542137, "timestamp": "2025-09-30 22:11:14.371954", "step": 1603, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:14.408822", "step": 1603, "epoch": 2 }, { "type": "loss", "content": 0.001994580263271928, "timestamp": "2025-09-30 22:11:14.445054", "step": 1604, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:14.480880", "step": 1604, "epoch": 2 }, { "type": "loss", "content": 0.005098961293697357, "timestamp": "2025-09-30 22:11:14.483469", "step": 1605, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:14.525716", "step": 1605, "epoch": 2 }, { "type": "loss", "content": 0.032878194004297256, "timestamp": "2025-09-30 22:11:14.528116", "step": 1606, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:14.562398", "step": 1606, "epoch": 2 }, { "type": "loss", "content": 0.007509057410061359, "timestamp": "2025-09-30 22:11:14.565653", "step": 1607, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:14.598373", "step": 1607, "epoch": 2 }, { "type": "loss", "content": 0.006099659949541092, "timestamp": "2025-09-30 22:11:14.622445", "step": 1608, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:14.657197", "step": 1608, "epoch": 2 }, { "type": "loss", "content": 0.007134649902582169, "timestamp": "2025-09-30 22:11:14.659747", "step": 1609, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:14.701744", "step": 1609, "epoch": 2 }, { "type": "loss", "content": 0.02740248665213585, "timestamp": "2025-09-30 22:11:14.705235", "step": 1610, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:14.742201", "step": 1610, "epoch": 2 }, { "type": "loss", "content": 0.037114981561899185, "timestamp": "2025-09-30 22:11:14.745149", "step": 1611, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:14.795904", "step": 1611, "epoch": 2 }, { "type": "loss", "content": 0.004330487456172705, "timestamp": "2025-09-30 22:11:14.826921", "step": 1612, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:14.860475", "step": 1612, "epoch": 2 }, { "type": "loss", "content": 0.014859907329082489, "timestamp": "2025-09-30 22:11:14.863817", "step": 1613, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:14.897312", "step": 1613, "epoch": 2 }, { "type": "loss", "content": 0.006890242453664541, "timestamp": "2025-09-30 22:11:14.900374", "step": 1614, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:14.934109", "step": 1614, "epoch": 2 }, { "type": "loss", "content": 0.004319236613810062, "timestamp": "2025-09-30 22:11:14.936652", "step": 1615, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:14.973253", "step": 1615, "epoch": 2 }, { "type": "loss", "content": 0.020764382556080818, "timestamp": "2025-09-30 22:11:14.999435", "step": 1616, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:15.032944", "step": 1616, "epoch": 2 }, { "type": "loss", "content": 0.017804834991693497, "timestamp": "2025-09-30 22:11:15.035453", "step": 1617, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:15.069228", "step": 1617, "epoch": 2 }, { "type": "loss", "content": 0.003709944197908044, "timestamp": "2025-09-30 22:11:15.076923", "step": 1618, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:15.116441", "step": 1618, "epoch": 2 }, { "type": "loss", "content": 0.010521622374653816, "timestamp": "2025-09-30 22:11:15.122817", "step": 1619, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:15.157877", "step": 1619, "epoch": 2 }, { "type": "loss", "content": 0.01603495515882969, "timestamp": "2025-09-30 22:11:15.182859", "step": 1620, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:15.232127", "step": 1620, "epoch": 2 }, { "type": "loss", "content": 0.0070645250380039215, "timestamp": "2025-09-30 22:11:15.236730", "step": 1621, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:15.271333", "step": 1621, "epoch": 2 }, { "type": "loss", "content": 0.009963012300431728, "timestamp": "2025-09-30 22:11:15.274460", "step": 1622, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:15.308615", "step": 1622, "epoch": 2 }, { "type": "loss", "content": 0.001885375240817666, "timestamp": "2025-09-30 22:11:15.312665", "step": 1623, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:15.358125", "step": 1623, "epoch": 2 }, { "type": "loss", "content": 0.0027446283493191004, "timestamp": "2025-09-30 22:11:15.382140", "step": 1624, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:15.426866", "step": 1624, "epoch": 2 }, { "type": "loss", "content": 0.010737799108028412, "timestamp": "2025-09-30 22:11:15.433567", "step": 1625, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:15.465717", "step": 1625, "epoch": 2 }, { "type": "loss", "content": 0.0061384267173707485, "timestamp": "2025-09-30 22:11:15.468549", "step": 1626, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:15.506781", "step": 1626, "epoch": 2 }, { "type": "loss", "content": 0.04606388881802559, "timestamp": "2025-09-30 22:11:15.511864", "step": 1627, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:11:15.547568", "step": 1627, "epoch": 2 }, { "type": "loss", "content": 0.007767940405756235, "timestamp": "2025-09-30 22:11:15.583697", "step": 1628, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:15.617768", "step": 1628, "epoch": 2 }, { "type": "loss", "content": 0.017403727397322655, "timestamp": "2025-09-30 22:11:15.622463", "step": 1629, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:15.658000", "step": 1629, "epoch": 2 }, { "type": "loss", "content": 0.008258694782853127, "timestamp": "2025-09-30 22:11:15.662312", "step": 1630, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:15.697829", "step": 1630, "epoch": 2 }, { "type": "loss", "content": 0.008111465722322464, "timestamp": "2025-09-30 22:11:15.701658", "step": 1631, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:15.741527", "step": 1631, "epoch": 2 }, { "type": "loss", "content": 0.03149692341685295, "timestamp": "2025-09-30 22:11:15.765403", "step": 1632, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:15.799399", "step": 1632, "epoch": 2 }, { "type": "loss", "content": 0.01508941687643528, "timestamp": "2025-09-30 22:11:15.802435", "step": 1633, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:15.843164", "step": 1633, "epoch": 2 }, { "type": "loss", "content": 0.007195001933723688, "timestamp": "2025-09-30 22:11:15.852408", "step": 1634, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:15.887132", "step": 1634, "epoch": 2 }, { "type": "loss", "content": 0.009918026626110077, "timestamp": "2025-09-30 22:11:15.890909", "step": 1635, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:15.925086", "step": 1635, "epoch": 2 }, { "type": "loss", "content": 0.00782895926386118, "timestamp": "2025-09-30 22:11:15.949569", "step": 1636, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:15.994565", "step": 1636, "epoch": 2 }, { "type": "loss", "content": 0.027636080980300903, "timestamp": "2025-09-30 22:11:16.001397", "step": 1637, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:16.036006", "step": 1637, "epoch": 2 }, { "type": "loss", "content": 0.0011494786012917757, "timestamp": "2025-09-30 22:11:16.045307", "step": 1638, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:16.083671", "step": 1638, "epoch": 2 }, { "type": "loss", "content": 0.0017236818093806505, "timestamp": "2025-09-30 22:11:16.090881", "step": 1639, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:16.122671", "step": 1639, "epoch": 2 }, { "type": "loss", "content": 0.0035358848981559277, "timestamp": "2025-09-30 22:11:16.150610", "step": 1640, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:16.201286", "step": 1640, "epoch": 2 }, { "type": "loss", "content": 0.023775506764650345, "timestamp": "2025-09-30 22:11:16.205223", "step": 1641, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:16.244562", "step": 1641, "epoch": 2 }, { "type": "loss", "content": 0.025196192786097527, "timestamp": "2025-09-30 22:11:16.253204", "step": 1642, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:16.288390", "step": 1642, "epoch": 2 }, { "type": "loss", "content": 0.006762804929167032, "timestamp": "2025-09-30 22:11:16.291240", "step": 1643, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:16.324656", "step": 1643, "epoch": 2 }, { "type": "loss", "content": 0.011221730150282383, "timestamp": "2025-09-30 22:11:16.348986", "step": 1644, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:16.386936", "step": 1644, "epoch": 2 }, { "type": "loss", "content": 0.004417662974447012, "timestamp": "2025-09-30 22:11:16.389510", "step": 1645, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:16.429199", "step": 1645, "epoch": 2 }, { "type": "loss", "content": 0.005136736668646336, "timestamp": "2025-09-30 22:11:16.433166", "step": 1646, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:16.475557", "step": 1646, "epoch": 2 }, { "type": "loss", "content": 0.010737181641161442, "timestamp": "2025-09-30 22:11:16.478448", "step": 1647, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:16.529688", "step": 1647, "epoch": 2 }, { "type": "loss", "content": 0.007168728858232498, "timestamp": "2025-09-30 22:11:16.554022", "step": 1648, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:16.588547", "step": 1648, "epoch": 2 }, { "type": "loss", "content": 0.02043146826326847, "timestamp": "2025-09-30 22:11:16.595036", "step": 1649, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:16.629907", "step": 1649, "epoch": 2 }, { "type": "loss", "content": 0.009257582947611809, "timestamp": "2025-09-30 22:11:16.632516", "step": 1650, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:16.669616", "step": 1650, "epoch": 2 }, { "type": "loss", "content": 0.004939699079841375, "timestamp": "2025-09-30 22:11:16.675479", "step": 1651, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:16.708352", "step": 1651, "epoch": 2 }, { "type": "loss", "content": 0.0030526297632604837, "timestamp": "2025-09-30 22:11:16.732305", "step": 1652, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:16.768422", "step": 1652, "epoch": 2 }, { "type": "loss", "content": 0.01626562885940075, "timestamp": "2025-09-30 22:11:16.779295", "step": 1653, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:11:17.777878", "step": 1653, "epoch": 2 }, { "type": "pplx", "content": 52579143.438625045, "timestamp": "2025-09-30 22:11:17.783346", "step": 1653, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:17.815788", "step": 1653, "epoch": 2 }, { "type": "loss", "content": 0.011779659427702427, "timestamp": "2025-09-30 22:11:17.820263", "step": 1654, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:17.853790", "step": 1654, "epoch": 2 }, { "type": "loss", "content": 0.005387573968619108, "timestamp": "2025-09-30 22:11:17.858850", "step": 1655, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:17.898917", "step": 1655, "epoch": 2 }, { "type": "loss", "content": 0.005358588416129351, "timestamp": "2025-09-30 22:11:17.923976", "step": 1656, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:17.966689", "step": 1656, "epoch": 2 }, { "type": "loss", "content": 0.025809431448578835, "timestamp": "2025-09-30 22:11:17.971246", "step": 1657, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:18.007610", "step": 1657, "epoch": 2 }, { "type": "loss", "content": 0.02251555025577545, "timestamp": "2025-09-30 22:11:18.011159", "step": 1658, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:18.054050", "step": 1658, "epoch": 2 }, { "type": "loss", "content": 0.0064919935539364815, "timestamp": "2025-09-30 22:11:18.064814", "step": 1659, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:18.104791", "step": 1659, "epoch": 2 }, { "type": "loss", "content": 0.004880402237176895, "timestamp": "2025-09-30 22:11:18.129394", "step": 1660, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:18.173429", "step": 1660, "epoch": 2 }, { "type": "loss", "content": 0.03059115633368492, "timestamp": "2025-09-30 22:11:18.175741", "step": 1661, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:18.209942", "step": 1661, "epoch": 2 }, { "type": "loss", "content": 0.01182292215526104, "timestamp": "2025-09-30 22:11:18.213649", "step": 1662, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:18.259001", "step": 1662, "epoch": 2 }, { "type": "loss", "content": 0.007961846888065338, "timestamp": "2025-09-30 22:11:18.266025", "step": 1663, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:18.305631", "step": 1663, "epoch": 2 }, { "type": "loss", "content": 0.0042456877417862415, "timestamp": "2025-09-30 22:11:18.330757", "step": 1664, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:18.363205", "step": 1664, "epoch": 2 }, { "type": "loss", "content": 0.011246146634221077, "timestamp": "2025-09-30 22:11:18.365745", "step": 1665, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:18.410565", "step": 1665, "epoch": 2 }, { "type": "loss", "content": 0.010035556741058826, "timestamp": "2025-09-30 22:11:18.413063", "step": 1666, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:18.446348", "step": 1666, "epoch": 2 }, { "type": "loss", "content": 0.015372427180409431, "timestamp": "2025-09-30 22:11:18.449732", "step": 1667, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:18.490277", "step": 1667, "epoch": 2 }, { "type": "loss", "content": 0.011080428957939148, "timestamp": "2025-09-30 22:11:18.514457", "step": 1668, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:18.555951", "step": 1668, "epoch": 2 }, { "type": "loss", "content": 0.03798171877861023, "timestamp": "2025-09-30 22:11:18.560229", "step": 1669, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:18.592950", "step": 1669, "epoch": 2 }, { "type": "loss", "content": 0.008265224285423756, "timestamp": "2025-09-30 22:11:18.598976", "step": 1670, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:18.637168", "step": 1670, "epoch": 2 }, { "type": "loss", "content": 0.011475497856736183, "timestamp": "2025-09-30 22:11:18.639645", "step": 1671, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:18.671316", "step": 1671, "epoch": 2 }, { "type": "loss", "content": 0.009563402272760868, "timestamp": "2025-09-30 22:11:18.698770", "step": 1672, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:18.745958", "step": 1672, "epoch": 2 }, { "type": "loss", "content": 0.008106688968837261, "timestamp": "2025-09-30 22:11:18.748927", "step": 1673, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:18.787684", "step": 1673, "epoch": 2 }, { "type": "loss", "content": 0.004956881981343031, "timestamp": "2025-09-30 22:11:18.790426", "step": 1674, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:18.833437", "step": 1674, "epoch": 2 }, { "type": "loss", "content": 0.002341997344046831, "timestamp": "2025-09-30 22:11:18.840321", "step": 1675, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:18.880124", "step": 1675, "epoch": 2 }, { "type": "loss", "content": 0.0017724635545164347, "timestamp": "2025-09-30 22:11:18.910265", "step": 1676, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:18.947876", "step": 1676, "epoch": 2 }, { "type": "loss", "content": 0.008215676061809063, "timestamp": "2025-09-30 22:11:18.951674", "step": 1677, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:18.990383", "step": 1677, "epoch": 2 }, { "type": "loss", "content": 0.006653092801570892, "timestamp": "2025-09-30 22:11:18.993539", "step": 1678, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:11:19.029880", "step": 1678, "epoch": 2 }, { "type": "loss", "content": 0.016842633485794067, "timestamp": "2025-09-30 22:11:19.035171", "step": 1679, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:19.079920", "step": 1679, "epoch": 2 }, { "type": "loss", "content": 0.009505665861070156, "timestamp": "2025-09-30 22:11:19.107996", "step": 1680, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:19.147255", "step": 1680, "epoch": 2 }, { "type": "loss", "content": 0.0035089331213384867, "timestamp": "2025-09-30 22:11:19.161010", "step": 1681, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:19.206903", "step": 1681, "epoch": 2 }, { "type": "loss", "content": 0.005087549332529306, "timestamp": "2025-09-30 22:11:19.211265", "step": 1682, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:19.261773", "step": 1682, "epoch": 2 }, { "type": "loss", "content": 0.0037894020788371563, "timestamp": "2025-09-30 22:11:19.266639", "step": 1683, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:19.304258", "step": 1683, "epoch": 2 }, { "type": "loss", "content": 0.0036766440607607365, "timestamp": "2025-09-30 22:11:19.328797", "step": 1684, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:19.371354", "step": 1684, "epoch": 2 }, { "type": "loss", "content": 0.013706843368709087, "timestamp": "2025-09-30 22:11:19.374073", "step": 1685, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:19.411114", "step": 1685, "epoch": 2 }, { "type": "loss", "content": 0.007448071148246527, "timestamp": "2025-09-30 22:11:19.413110", "step": 1686, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:19.444973", "step": 1686, "epoch": 2 }, { "type": "loss", "content": 0.03570406511425972, "timestamp": "2025-09-30 22:11:19.447758", "step": 1687, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:19.490999", "step": 1687, "epoch": 2 }, { "type": "loss", "content": 0.045662470161914825, "timestamp": "2025-09-30 22:11:19.514896", "step": 1688, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:19.553825", "step": 1688, "epoch": 2 }, { "type": "loss", "content": 0.0003720544627867639, "timestamp": "2025-09-30 22:11:19.556171", "step": 1689, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:19.595078", "step": 1689, "epoch": 2 }, { "type": "loss", "content": 0.050418563187122345, "timestamp": "2025-09-30 22:11:19.598579", "step": 1690, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:19.634837", "step": 1690, "epoch": 2 }, { "type": "loss", "content": 0.000317491969326511, "timestamp": "2025-09-30 22:11:19.638883", "step": 1691, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:19.675596", "step": 1691, "epoch": 2 }, { "type": "loss", "content": 0.029729658737778664, "timestamp": "2025-09-30 22:11:19.699876", "step": 1692, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:19.746572", "step": 1692, "epoch": 2 }, { "type": "loss", "content": 0.012829835526645184, "timestamp": "2025-09-30 22:11:19.750386", "step": 1693, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:19.794800", "step": 1693, "epoch": 2 }, { "type": "loss", "content": 0.003971923608332872, "timestamp": "2025-09-30 22:11:19.799952", "step": 1694, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:19.839851", "step": 1694, "epoch": 2 }, { "type": "loss", "content": 0.0006053160759620368, "timestamp": "2025-09-30 22:11:19.842887", "step": 1695, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:19.875733", "step": 1695, "epoch": 2 }, { "type": "loss", "content": 0.05187639221549034, "timestamp": "2025-09-30 22:11:19.899626", "step": 1696, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:19.931281", "step": 1696, "epoch": 2 }, { "type": "loss", "content": 0.0017442479729652405, "timestamp": "2025-09-30 22:11:19.933872", "step": 1697, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:19.968020", "step": 1697, "epoch": 2 }, { "type": "loss", "content": 0.017671983689069748, "timestamp": "2025-09-30 22:11:19.971499", "step": 1698, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:20.006265", "step": 1698, "epoch": 2 }, { "type": "loss", "content": 0.002051006769761443, "timestamp": "2025-09-30 22:11:20.010473", "step": 1699, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:20.049175", "step": 1699, "epoch": 2 }, { "type": "loss", "content": 0.012735229916870594, "timestamp": "2025-09-30 22:11:20.076512", "step": 1700, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:20.114131", "step": 1700, "epoch": 2 }, { "type": "loss", "content": 0.00960957258939743, "timestamp": "2025-09-30 22:11:20.116573", "step": 1701, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:20.149478", "step": 1701, "epoch": 2 }, { "type": "loss", "content": 0.0022175023332238197, "timestamp": "2025-09-30 22:11:20.152937", "step": 1702, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:20.186794", "step": 1702, "epoch": 2 }, { "type": "loss", "content": 0.0005305535742081702, "timestamp": "2025-09-30 22:11:20.191085", "step": 1703, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:20.241516", "step": 1703, "epoch": 2 }, { "type": "loss", "content": 0.0003927931247744709, "timestamp": "2025-09-30 22:11:20.265874", "step": 1704, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:20.302418", "step": 1704, "epoch": 2 }, { "type": "loss", "content": 0.006027755327522755, "timestamp": "2025-09-30 22:11:20.307492", "step": 1705, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:20.339271", "step": 1705, "epoch": 2 }, { "type": "loss", "content": 0.017157012596726418, "timestamp": "2025-09-30 22:11:20.341891", "step": 1706, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:20.372977", "step": 1706, "epoch": 2 }, { "type": "loss", "content": 0.0005596071714535356, "timestamp": "2025-09-30 22:11:20.386134", "step": 1707, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:20.418184", "step": 1707, "epoch": 2 }, { "type": "loss", "content": 0.00957457721233368, "timestamp": "2025-09-30 22:11:20.442210", "step": 1708, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:20.481850", "step": 1708, "epoch": 2 }, { "type": "loss", "content": 0.00753258541226387, "timestamp": "2025-09-30 22:11:20.486305", "step": 1709, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:20.542885", "step": 1709, "epoch": 2 }, { "type": "loss", "content": 0.0016120340442284942, "timestamp": "2025-09-30 22:11:20.546828", "step": 1710, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:11:22.077267", "step": 1710, "epoch": 2 }, { "type": "pplx", "content": 60174026.64577574, "timestamp": "2025-09-30 22:11:22.080733", "step": 1710, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:22.152726", "step": 1710, "epoch": 2 }, { "type": "loss", "content": 0.010765468701720238, "timestamp": "2025-09-30 22:11:22.175983", "step": 1711, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:22.223024", "step": 1711, "epoch": 2 }, { "type": "loss", "content": 0.010604661889374256, "timestamp": "2025-09-30 22:11:22.252521", "step": 1712, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:22.287267", "step": 1712, "epoch": 2 }, { "type": "loss", "content": 0.004304408561438322, "timestamp": "2025-09-30 22:11:22.290026", "step": 1713, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:22.327356", "step": 1713, "epoch": 2 }, { "type": "loss", "content": 0.0032625349704176188, "timestamp": "2025-09-30 22:11:22.334879", "step": 1714, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:22.369771", "step": 1714, "epoch": 2 }, { "type": "loss", "content": 0.002788375597447157, "timestamp": "2025-09-30 22:11:22.373507", "step": 1715, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:22.421356", "step": 1715, "epoch": 2 }, { "type": "loss", "content": 0.004468221217393875, "timestamp": "2025-09-30 22:11:22.445723", "step": 1716, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:22.480791", "step": 1716, "epoch": 2 }, { "type": "loss", "content": 0.034291721880435944, "timestamp": "2025-09-30 22:11:22.490757", "step": 1717, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:22.533177", "step": 1717, "epoch": 2 }, { "type": "loss", "content": 0.00639685895293951, "timestamp": "2025-09-30 22:11:22.537647", "step": 1718, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:22.586726", "step": 1718, "epoch": 2 }, { "type": "loss", "content": 0.014624513685703278, "timestamp": "2025-09-30 22:11:22.589658", "step": 1719, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:22.623660", "step": 1719, "epoch": 2 }, { "type": "loss", "content": 0.019972490146756172, "timestamp": "2025-09-30 22:11:22.648502", "step": 1720, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:22.683821", "step": 1720, "epoch": 2 }, { "type": "loss", "content": 0.0026740862522274256, "timestamp": "2025-09-30 22:11:22.686918", "step": 1721, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:22.718315", "step": 1721, "epoch": 2 }, { "type": "loss", "content": 0.004609130322933197, "timestamp": "2025-09-30 22:11:22.722070", "step": 1722, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:22.757462", "step": 1722, "epoch": 2 }, { "type": "loss", "content": 0.0044154576025903225, "timestamp": "2025-09-30 22:11:22.761134", "step": 1723, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:22.798446", "step": 1723, "epoch": 2 }, { "type": "loss", "content": 0.004700851161032915, "timestamp": "2025-09-30 22:11:22.828064", "step": 1724, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:22.861723", "step": 1724, "epoch": 2 }, { "type": "loss", "content": 0.0020207969937473536, "timestamp": "2025-09-30 22:11:22.864905", "step": 1725, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:22.902422", "step": 1725, "epoch": 2 }, { "type": "loss", "content": 0.008287390694022179, "timestamp": "2025-09-30 22:11:22.905044", "step": 1726, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:22.938181", "step": 1726, "epoch": 2 }, { "type": "loss", "content": 0.011976302601397038, "timestamp": "2025-09-30 22:11:22.941404", "step": 1727, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:22.975675", "step": 1727, "epoch": 2 }, { "type": "loss", "content": 0.0026996787637472153, "timestamp": "2025-09-30 22:11:23.000534", "step": 1728, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:23.035217", "step": 1728, "epoch": 2 }, { "type": "loss", "content": 0.007718226406723261, "timestamp": "2025-09-30 22:11:23.046962", "step": 1729, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:23.083209", "step": 1729, "epoch": 2 }, { "type": "loss", "content": 0.007085180841386318, "timestamp": "2025-09-30 22:11:23.086526", "step": 1730, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:23.119688", "step": 1730, "epoch": 2 }, { "type": "loss", "content": 0.009826818481087685, "timestamp": "2025-09-30 22:11:23.122632", "step": 1731, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:23.164476", "step": 1731, "epoch": 2 }, { "type": "loss", "content": 0.0005284692742861807, "timestamp": "2025-09-30 22:11:23.190815", "step": 1732, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:23.225736", "step": 1732, "epoch": 2 }, { "type": "loss", "content": 0.0018098134314641356, "timestamp": "2025-09-30 22:11:23.229164", "step": 1733, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:23.263673", "step": 1733, "epoch": 2 }, { "type": "loss", "content": 0.0055390470661222935, "timestamp": "2025-09-30 22:11:23.267756", "step": 1734, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:23.302555", "step": 1734, "epoch": 2 }, { "type": "loss", "content": 0.003144989488646388, "timestamp": "2025-09-30 22:11:23.310271", "step": 1735, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:23.344838", "step": 1735, "epoch": 2 }, { "type": "loss", "content": 0.005034166853874922, "timestamp": "2025-09-30 22:11:23.369640", "step": 1736, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:23.402031", "step": 1736, "epoch": 2 }, { "type": "loss", "content": 0.010449747554957867, "timestamp": "2025-09-30 22:11:23.404838", "step": 1737, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:23.440348", "step": 1737, "epoch": 2 }, { "type": "loss", "content": 0.017017576843500137, "timestamp": "2025-09-30 22:11:23.448499", "step": 1738, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:23.483875", "step": 1738, "epoch": 2 }, { "type": "loss", "content": 0.00233058026060462, "timestamp": "2025-09-30 22:11:23.487180", "step": 1739, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:23.519793", "step": 1739, "epoch": 2 }, { "type": "loss", "content": 0.002276728395372629, "timestamp": "2025-09-30 22:11:23.548173", "step": 1740, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:23.580676", "step": 1740, "epoch": 2 }, { "type": "loss", "content": 0.0223015658557415, "timestamp": "2025-09-30 22:11:23.593857", "step": 1741, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:23.627828", "step": 1741, "epoch": 2 }, { "type": "loss", "content": 0.004606612492352724, "timestamp": "2025-09-30 22:11:23.630529", "step": 1742, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:23.666316", "step": 1742, "epoch": 2 }, { "type": "loss", "content": 0.009004906751215458, "timestamp": "2025-09-30 22:11:23.669336", "step": 1743, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:23.710375", "step": 1743, "epoch": 2 }, { "type": "loss", "content": 0.018749961629509926, "timestamp": "2025-09-30 22:11:23.734755", "step": 1744, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:23.767285", "step": 1744, "epoch": 2 }, { "type": "loss", "content": 0.0017843234818428755, "timestamp": "2025-09-30 22:11:23.769754", "step": 1745, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:23.807368", "step": 1745, "epoch": 2 }, { "type": "loss", "content": 0.0022551752626895905, "timestamp": "2025-09-30 22:11:23.814763", "step": 1746, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:23.847784", "step": 1746, "epoch": 2 }, { "type": "loss", "content": 0.0012390002375468612, "timestamp": "2025-09-30 22:11:23.851902", "step": 1747, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:23.891863", "step": 1747, "epoch": 2 }, { "type": "loss", "content": 0.002626475179567933, "timestamp": "2025-09-30 22:11:23.916279", "step": 1748, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:23.954270", "step": 1748, "epoch": 2 }, { "type": "loss", "content": 0.0008998786797747016, "timestamp": "2025-09-30 22:11:23.965730", "step": 1749, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:23.999385", "step": 1749, "epoch": 2 }, { "type": "loss", "content": 0.002064060652628541, "timestamp": "2025-09-30 22:11:24.006826", "step": 1750, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:24.038989", "step": 1750, "epoch": 2 }, { "type": "loss", "content": 0.014346698299050331, "timestamp": "2025-09-30 22:11:24.045224", "step": 1751, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:24.093531", "step": 1751, "epoch": 2 }, { "type": "loss", "content": 0.0009058486321009696, "timestamp": "2025-09-30 22:11:24.123954", "step": 1752, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:24.157834", "step": 1752, "epoch": 2 }, { "type": "loss", "content": 0.011217797175049782, "timestamp": "2025-09-30 22:11:24.161244", "step": 1753, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:24.200558", "step": 1753, "epoch": 2 }, { "type": "loss", "content": 0.038053300231695175, "timestamp": "2025-09-30 22:11:24.204331", "step": 1754, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:24.236421", "step": 1754, "epoch": 2 }, { "type": "loss", "content": 0.0037916922010481358, "timestamp": "2025-09-30 22:11:24.240045", "step": 1755, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:24.272620", "step": 1755, "epoch": 2 }, { "type": "loss", "content": 0.005685042589902878, "timestamp": "2025-09-30 22:11:24.297228", "step": 1756, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:24.328506", "step": 1756, "epoch": 2 }, { "type": "loss", "content": 0.0005464585847221315, "timestamp": "2025-09-30 22:11:24.331344", "step": 1757, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:24.366826", "step": 1757, "epoch": 2 }, { "type": "loss", "content": 0.0022288798354566097, "timestamp": "2025-09-30 22:11:24.369591", "step": 1758, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:24.408028", "step": 1758, "epoch": 2 }, { "type": "loss", "content": 0.0014701449545100331, "timestamp": "2025-09-30 22:11:24.411765", "step": 1759, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:24.456430", "step": 1759, "epoch": 2 }, { "type": "loss", "content": 0.018580257892608643, "timestamp": "2025-09-30 22:11:24.480004", "step": 1760, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:24.514320", "step": 1760, "epoch": 2 }, { "type": "loss", "content": 0.005124218761920929, "timestamp": "2025-09-30 22:11:24.521142", "step": 1761, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:24.563696", "step": 1761, "epoch": 2 }, { "type": "loss", "content": 0.0004061157815158367, "timestamp": "2025-09-30 22:11:24.569458", "step": 1762, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:24.606484", "step": 1762, "epoch": 2 }, { "type": "loss", "content": 0.00017968073370866477, "timestamp": "2025-09-30 22:11:24.609874", "step": 1763, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:24.647850", "step": 1763, "epoch": 2 }, { "type": "loss", "content": 0.008494771085679531, "timestamp": "2025-09-30 22:11:24.675944", "step": 1764, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:24.713922", "step": 1764, "epoch": 2 }, { "type": "loss", "content": 0.005634578876197338, "timestamp": "2025-09-30 22:11:24.716879", "step": 1765, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:24.749668", "step": 1765, "epoch": 2 }, { "type": "loss", "content": 0.0004148540028836578, "timestamp": "2025-09-30 22:11:24.754076", "step": 1766, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:24.795159", "step": 1766, "epoch": 2 }, { "type": "loss", "content": 0.000306047557387501, "timestamp": "2025-09-30 22:11:24.799871", "step": 1767, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:11:25.740622", "step": 1767, "epoch": 2 }, { "type": "pplx", "content": 62558342.156648576, "timestamp": "2025-09-30 22:11:25.744118", "step": 1767, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:25.774830", "step": 1767, "epoch": 2 }, { "type": "loss", "content": 0.00027061282889917493, "timestamp": "2025-09-30 22:11:25.799665", "step": 1768, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:25.838783", "step": 1768, "epoch": 2 }, { "type": "loss", "content": 0.004808730445802212, "timestamp": "2025-09-30 22:11:25.849752", "step": 1769, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:25.882948", "step": 1769, "epoch": 2 }, { "type": "loss", "content": 0.002109024440869689, "timestamp": "2025-09-30 22:11:25.885906", "step": 1770, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:25.925785", "step": 1770, "epoch": 2 }, { "type": "loss", "content": 0.0014152558287605643, "timestamp": "2025-09-30 22:11:25.935621", "step": 1771, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:25.984313", "step": 1771, "epoch": 2 }, { "type": "loss", "content": 0.00017617108824197203, "timestamp": "2025-09-30 22:11:26.013188", "step": 1772, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:26.048944", "step": 1772, "epoch": 2 }, { "type": "loss", "content": 0.0008881940157152712, "timestamp": "2025-09-30 22:11:26.055619", "step": 1773, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:26.098786", "step": 1773, "epoch": 2 }, { "type": "loss", "content": 0.005856442730873823, "timestamp": "2025-09-30 22:11:26.106763", "step": 1774, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:26.142555", "step": 1774, "epoch": 2 }, { "type": "loss", "content": 0.02501927874982357, "timestamp": "2025-09-30 22:11:26.145541", "step": 1775, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:26.180813", "step": 1775, "epoch": 2 }, { "type": "loss", "content": 0.0009421083959750831, "timestamp": "2025-09-30 22:11:26.210943", "step": 1776, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:26.248770", "step": 1776, "epoch": 2 }, { "type": "loss", "content": 0.0019056095043197274, "timestamp": "2025-09-30 22:11:26.256469", "step": 1777, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:26.287290", "step": 1777, "epoch": 2 }, { "type": "loss", "content": 0.0005593043169938028, "timestamp": "2025-09-30 22:11:26.315432", "step": 1778, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:26.349521", "step": 1778, "epoch": 2 }, { "type": "loss", "content": 0.0033685010857880116, "timestamp": "2025-09-30 22:11:26.356792", "step": 1779, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:26.400591", "step": 1779, "epoch": 2 }, { "type": "loss", "content": 0.002042163861915469, "timestamp": "2025-09-30 22:11:26.424522", "step": 1780, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:26.460158", "step": 1780, "epoch": 2 }, { "type": "loss", "content": 0.00037928626989014447, "timestamp": "2025-09-30 22:11:26.462324", "step": 1781, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:26.494213", "step": 1781, "epoch": 2 }, { "type": "loss", "content": 0.0018793040653690696, "timestamp": "2025-09-30 22:11:26.500551", "step": 1782, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:26.554386", "step": 1782, "epoch": 2 }, { "type": "loss", "content": 0.0003356645174790174, "timestamp": "2025-09-30 22:11:26.566541", "step": 1783, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:26.608962", "step": 1783, "epoch": 2 }, { "type": "loss", "content": 0.0002973336377181113, "timestamp": "2025-09-30 22:11:26.634295", "step": 1784, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:26.672928", "step": 1784, "epoch": 2 }, { "type": "loss", "content": 0.00013657430827151984, "timestamp": "2025-09-30 22:11:26.681281", "step": 1785, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:26.722574", "step": 1785, "epoch": 2 }, { "type": "loss", "content": 0.0005276420270092785, "timestamp": "2025-09-30 22:11:26.725767", "step": 1786, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:26.756868", "step": 1786, "epoch": 2 }, { "type": "loss", "content": 0.009822599589824677, "timestamp": "2025-09-30 22:11:26.759795", "step": 1787, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:26.791563", "step": 1787, "epoch": 2 }, { "type": "loss", "content": 0.008391637355089188, "timestamp": "2025-09-30 22:11:26.816052", "step": 1788, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:26.853075", "step": 1788, "epoch": 2 }, { "type": "loss", "content": 0.00027605355717241764, "timestamp": "2025-09-30 22:11:26.862087", "step": 1789, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:26.894932", "step": 1789, "epoch": 2 }, { "type": "loss", "content": 0.00037099263863638043, "timestamp": "2025-09-30 22:11:26.898216", "step": 1790, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:26.931943", "step": 1790, "epoch": 2 }, { "type": "loss", "content": 0.0022850236855447292, "timestamp": "2025-09-30 22:11:26.944325", "step": 1791, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:26.979724", "step": 1791, "epoch": 2 }, { "type": "loss", "content": 0.0019840167369693518, "timestamp": "2025-09-30 22:11:27.004499", "step": 1792, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:27.050239", "step": 1792, "epoch": 2 }, { "type": "loss", "content": 0.0010736408876255155, "timestamp": "2025-09-30 22:11:27.054191", "step": 1793, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:27.106593", "step": 1793, "epoch": 2 }, { "type": "loss", "content": 0.0003287459840066731, "timestamp": "2025-09-30 22:11:27.116116", "step": 1794, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:27.156132", "step": 1794, "epoch": 2 }, { "type": "loss", "content": 0.001558436080813408, "timestamp": "2025-09-30 22:11:27.159397", "step": 1795, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:27.202415", "step": 1795, "epoch": 2 }, { "type": "loss", "content": 0.0010486978571861982, "timestamp": "2025-09-30 22:11:27.226969", "step": 1796, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:27.260541", "step": 1796, "epoch": 2 }, { "type": "loss", "content": 0.015961362048983574, "timestamp": "2025-09-30 22:11:27.263397", "step": 1797, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:27.297177", "step": 1797, "epoch": 2 }, { "type": "loss", "content": 0.043664541095495224, "timestamp": "2025-09-30 22:11:27.307270", "step": 1798, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:27.344094", "step": 1798, "epoch": 2 }, { "type": "loss", "content": 0.008131652139127254, "timestamp": "2025-09-30 22:11:27.347524", "step": 1799, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:27.385153", "step": 1799, "epoch": 2 }, { "type": "loss", "content": 0.02529001235961914, "timestamp": "2025-09-30 22:11:27.410430", "step": 1800, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:27.446667", "step": 1800, "epoch": 2 }, { "type": "loss", "content": 0.00021200468472670764, "timestamp": "2025-09-30 22:11:27.459388", "step": 1801, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:27.495432", "step": 1801, "epoch": 2 }, { "type": "loss", "content": 0.004534170497208834, "timestamp": "2025-09-30 22:11:27.498163", "step": 1802, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:27.542871", "step": 1802, "epoch": 2 }, { "type": "loss", "content": 0.003273914335295558, "timestamp": "2025-09-30 22:11:27.545274", "step": 1803, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:27.581655", "step": 1803, "epoch": 2 }, { "type": "loss", "content": 0.00039792084135115147, "timestamp": "2025-09-30 22:11:27.608954", "step": 1804, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:27.642253", "step": 1804, "epoch": 2 }, { "type": "loss", "content": 0.006401257123798132, "timestamp": "2025-09-30 22:11:27.644862", "step": 1805, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:27.698040", "step": 1805, "epoch": 2 }, { "type": "loss", "content": 0.0014091616030782461, "timestamp": "2025-09-30 22:11:27.700758", "step": 1806, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:27.740365", "step": 1806, "epoch": 2 }, { "type": "loss", "content": 0.004255100153386593, "timestamp": "2025-09-30 22:11:27.743865", "step": 1807, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:27.785686", "step": 1807, "epoch": 2 }, { "type": "loss", "content": 0.005804498679935932, "timestamp": "2025-09-30 22:11:27.811782", "step": 1808, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:27.873326", "step": 1808, "epoch": 2 }, { "type": "loss", "content": 0.000618809019215405, "timestamp": "2025-09-30 22:11:27.883352", "step": 1809, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:27.925631", "step": 1809, "epoch": 2 }, { "type": "loss", "content": 0.0020599744748324156, "timestamp": "2025-09-30 22:11:27.929742", "step": 1810, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:27.971440", "step": 1810, "epoch": 2 }, { "type": "loss", "content": 0.0024426288437098265, "timestamp": "2025-09-30 22:11:27.975627", "step": 1811, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:28.024342", "step": 1811, "epoch": 2 }, { "type": "loss", "content": 0.0016171614406630397, "timestamp": "2025-09-30 22:11:28.050896", "step": 1812, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:28.084221", "step": 1812, "epoch": 2 }, { "type": "loss", "content": 0.0019683155696839094, "timestamp": "2025-09-30 22:11:28.088533", "step": 1813, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:28.126055", "step": 1813, "epoch": 2 }, { "type": "loss", "content": 0.0032566902227699757, "timestamp": "2025-09-30 22:11:28.128291", "step": 1814, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:28.163275", "step": 1814, "epoch": 2 }, { "type": "loss", "content": 0.010501679964363575, "timestamp": "2025-09-30 22:11:28.165758", "step": 1815, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:28.204593", "step": 1815, "epoch": 2 }, { "type": "loss", "content": 0.0007786615751683712, "timestamp": "2025-09-30 22:11:28.234642", "step": 1816, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:28.266876", "step": 1816, "epoch": 2 }, { "type": "loss", "content": 0.0010455233277752995, "timestamp": "2025-09-30 22:11:28.269027", "step": 1817, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:28.315671", "step": 1817, "epoch": 2 }, { "type": "loss", "content": 0.045351140201091766, "timestamp": "2025-09-30 22:11:28.319163", "step": 1818, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:28.351392", "step": 1818, "epoch": 2 }, { "type": "loss", "content": 0.05572187155485153, "timestamp": "2025-09-30 22:11:28.354615", "step": 1819, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:28.387521", "step": 1819, "epoch": 2 }, { "type": "loss", "content": 0.0026829210110008717, "timestamp": "2025-09-30 22:11:28.412413", "step": 1820, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:28.452683", "step": 1820, "epoch": 2 }, { "type": "loss", "content": 0.0022102470975369215, "timestamp": "2025-09-30 22:11:28.457055", "step": 1821, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:28.488524", "step": 1821, "epoch": 2 }, { "type": "loss", "content": 0.010895676910877228, "timestamp": "2025-09-30 22:11:28.491925", "step": 1822, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:28.528782", "step": 1822, "epoch": 2 }, { "type": "loss", "content": 0.00846769381314516, "timestamp": "2025-09-30 22:11:28.531937", "step": 1823, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:28.569305", "step": 1823, "epoch": 2 }, { "type": "loss", "content": 0.013364391401410103, "timestamp": "2025-09-30 22:11:28.593307", "step": 1824, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:11:29.466059", "step": 1824, "epoch": 2 }, { "type": "pplx", "content": 57843973.11111791, "timestamp": "2025-09-30 22:11:29.469482", "step": 1824, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:29.500883", "step": 1824, "epoch": 2 }, { "type": "loss", "content": 0.006056804675608873, "timestamp": "2025-09-30 22:11:29.504191", "step": 1825, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:29.543140", "step": 1825, "epoch": 2 }, { "type": "loss", "content": 0.0024564911145716906, "timestamp": "2025-09-30 22:11:29.545870", "step": 1826, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:29.587508", "step": 1826, "epoch": 2 }, { "type": "loss", "content": 0.0025678572710603476, "timestamp": "2025-09-30 22:11:29.590641", "step": 1827, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:29.623857", "step": 1827, "epoch": 2 }, { "type": "loss", "content": 0.011328652501106262, "timestamp": "2025-09-30 22:11:29.649252", "step": 1828, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:29.684405", "step": 1828, "epoch": 2 }, { "type": "loss", "content": 0.0005400671507231891, "timestamp": "2025-09-30 22:11:29.687595", "step": 1829, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:29.726180", "step": 1829, "epoch": 2 }, { "type": "loss", "content": 0.040070440620183945, "timestamp": "2025-09-30 22:11:29.737205", "step": 1830, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:29.779941", "step": 1830, "epoch": 2 }, { "type": "loss", "content": 0.0002562327135819942, "timestamp": "2025-09-30 22:11:29.782245", "step": 1831, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:29.817311", "step": 1831, "epoch": 2 }, { "type": "loss", "content": 0.004279036540538073, "timestamp": "2025-09-30 22:11:29.841039", "step": 1832, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:29.877457", "step": 1832, "epoch": 2 }, { "type": "loss", "content": 0.0022578116040676832, "timestamp": "2025-09-30 22:11:29.881248", "step": 1833, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:29.914510", "step": 1833, "epoch": 2 }, { "type": "loss", "content": 0.003075752407312393, "timestamp": "2025-09-30 22:11:29.917617", "step": 1834, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:29.968967", "step": 1834, "epoch": 3 }, { "type": "loss", "content": 0.03581417351961136, "timestamp": "2025-09-30 22:11:29.971908", "step": 1835, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:30.005282", "step": 1835, "epoch": 3 }, { "type": "loss", "content": 0.019369926303625107, "timestamp": "2025-09-30 22:11:30.032284", "step": 1836, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:30.070963", "step": 1836, "epoch": 3 }, { "type": "loss", "content": 0.007873108610510826, "timestamp": "2025-09-30 22:11:30.073418", "step": 1837, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:30.104032", "step": 1837, "epoch": 3 }, { "type": "loss", "content": 0.007872181944549084, "timestamp": "2025-09-30 22:11:30.106911", "step": 1838, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:30.143721", "step": 1838, "epoch": 3 }, { "type": "loss", "content": 0.07405262440443039, "timestamp": "2025-09-30 22:11:30.147078", "step": 1839, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:30.184634", "step": 1839, "epoch": 3 }, { "type": "loss", "content": 0.022155776619911194, "timestamp": "2025-09-30 22:11:30.208670", "step": 1840, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:30.240560", "step": 1840, "epoch": 3 }, { "type": "loss", "content": 0.012954522855579853, "timestamp": "2025-09-30 22:11:30.242675", "step": 1841, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:30.276199", "step": 1841, "epoch": 3 }, { "type": "loss", "content": 0.0008165457402355969, "timestamp": "2025-09-30 22:11:30.290028", "step": 1842, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:30.324121", "step": 1842, "epoch": 3 }, { "type": "loss", "content": 0.00022883024939801544, "timestamp": "2025-09-30 22:11:30.333024", "step": 1843, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:30.366347", "step": 1843, "epoch": 3 }, { "type": "loss", "content": 0.046065591275691986, "timestamp": "2025-09-30 22:11:30.389844", "step": 1844, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:30.425144", "step": 1844, "epoch": 3 }, { "type": "loss", "content": 0.01905796490609646, "timestamp": "2025-09-30 22:11:30.428573", "step": 1845, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:30.462015", "step": 1845, "epoch": 3 }, { "type": "loss", "content": 0.0009818606777116656, "timestamp": "2025-09-30 22:11:30.465512", "step": 1846, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:30.497038", "step": 1846, "epoch": 3 }, { "type": "loss", "content": 0.01879790425300598, "timestamp": "2025-09-30 22:11:30.498499", "step": 1847, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:30.527899", "step": 1847, "epoch": 3 }, { "type": "loss", "content": 0.0275017861276865, "timestamp": "2025-09-30 22:11:30.552441", "step": 1848, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:30.585621", "step": 1848, "epoch": 3 }, { "type": "loss", "content": 0.013021481223404408, "timestamp": "2025-09-30 22:11:30.591829", "step": 1849, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:30.631803", "step": 1849, "epoch": 3 }, { "type": "loss", "content": 0.014080002903938293, "timestamp": "2025-09-30 22:11:30.634092", "step": 1850, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:30.671207", "step": 1850, "epoch": 3 }, { "type": "loss", "content": 0.006925381254404783, "timestamp": "2025-09-30 22:11:30.674380", "step": 1851, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:30.708206", "step": 1851, "epoch": 3 }, { "type": "loss", "content": 0.0025633336044847965, "timestamp": "2025-09-30 22:11:30.732466", "step": 1852, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:30.764571", "step": 1852, "epoch": 3 }, { "type": "loss", "content": 0.012529529631137848, "timestamp": "2025-09-30 22:11:30.779687", "step": 1853, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:30.821928", "step": 1853, "epoch": 3 }, { "type": "loss", "content": 0.007848630659282207, "timestamp": "2025-09-30 22:11:30.824559", "step": 1854, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:30.856888", "step": 1854, "epoch": 3 }, { "type": "loss", "content": 0.03343925625085831, "timestamp": "2025-09-30 22:11:30.861037", "step": 1855, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:30.895355", "step": 1855, "epoch": 3 }, { "type": "loss", "content": 0.04466833919286728, "timestamp": "2025-09-30 22:11:30.933462", "step": 1856, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:30.976785", "step": 1856, "epoch": 3 }, { "type": "loss", "content": 0.008691991679370403, "timestamp": "2025-09-30 22:11:30.980724", "step": 1857, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:31.024793", "step": 1857, "epoch": 3 }, { "type": "loss", "content": 0.014497383497655392, "timestamp": "2025-09-30 22:11:31.044058", "step": 1858, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:31.076221", "step": 1858, "epoch": 3 }, { "type": "loss", "content": 0.013723026029765606, "timestamp": "2025-09-30 22:11:31.079468", "step": 1859, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:31.113297", "step": 1859, "epoch": 3 }, { "type": "loss", "content": 0.008932454511523247, "timestamp": "2025-09-30 22:11:31.141385", "step": 1860, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:31.175058", "step": 1860, "epoch": 3 }, { "type": "loss", "content": 0.007725898642092943, "timestamp": "2025-09-30 22:11:31.177865", "step": 1861, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:31.211411", "step": 1861, "epoch": 3 }, { "type": "loss", "content": 0.01678680069744587, "timestamp": "2025-09-30 22:11:31.218039", "step": 1862, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:31.250127", "step": 1862, "epoch": 3 }, { "type": "loss", "content": 0.0050750188529491425, "timestamp": "2025-09-30 22:11:31.252823", "step": 1863, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:31.283374", "step": 1863, "epoch": 3 }, { "type": "loss", "content": 0.012287462130188942, "timestamp": "2025-09-30 22:11:31.310546", "step": 1864, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:31.345395", "step": 1864, "epoch": 3 }, { "type": "loss", "content": 0.014692439697682858, "timestamp": "2025-09-30 22:11:31.348943", "step": 1865, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:31.382906", "step": 1865, "epoch": 3 }, { "type": "loss", "content": 0.016319891437888145, "timestamp": "2025-09-30 22:11:31.385310", "step": 1866, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:31.419345", "step": 1866, "epoch": 3 }, { "type": "loss", "content": 0.007593729067593813, "timestamp": "2025-09-30 22:11:31.421938", "step": 1867, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:31.453051", "step": 1867, "epoch": 3 }, { "type": "loss", "content": 0.015587197616696358, "timestamp": "2025-09-30 22:11:31.476657", "step": 1868, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:31.517082", "step": 1868, "epoch": 3 }, { "type": "loss", "content": 0.010518831200897694, "timestamp": "2025-09-30 22:11:31.519589", "step": 1869, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:31.555541", "step": 1869, "epoch": 3 }, { "type": "loss", "content": 0.023400625213980675, "timestamp": "2025-09-30 22:11:31.565628", "step": 1870, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:31.625427", "step": 1870, "epoch": 3 }, { "type": "loss", "content": 0.013268728740513325, "timestamp": "2025-09-30 22:11:31.631806", "step": 1871, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:31.683722", "step": 1871, "epoch": 3 }, { "type": "loss", "content": 0.00882081501185894, "timestamp": "2025-09-30 22:11:31.712258", "step": 1872, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:31.743884", "step": 1872, "epoch": 3 }, { "type": "loss", "content": 0.011401886120438576, "timestamp": "2025-09-30 22:11:31.746451", "step": 1873, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:31.777996", "step": 1873, "epoch": 3 }, { "type": "loss", "content": 0.01912127062678337, "timestamp": "2025-09-30 22:11:31.784925", "step": 1874, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:31.832238", "step": 1874, "epoch": 3 }, { "type": "loss", "content": 0.009558220393955708, "timestamp": "2025-09-30 22:11:31.844336", "step": 1875, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:31.882936", "step": 1875, "epoch": 3 }, { "type": "loss", "content": 0.004013984929770231, "timestamp": "2025-09-30 22:11:31.906337", "step": 1876, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:31.937528", "step": 1876, "epoch": 3 }, { "type": "loss", "content": 0.008159502409398556, "timestamp": "2025-09-30 22:11:31.940317", "step": 1877, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:31.980065", "step": 1877, "epoch": 3 }, { "type": "loss", "content": 0.000791891769040376, "timestamp": "2025-09-30 22:11:31.987844", "step": 1878, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:32.023641", "step": 1878, "epoch": 3 }, { "type": "loss", "content": 0.004595204256474972, "timestamp": "2025-09-30 22:11:32.028113", "step": 1879, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:32.061945", "step": 1879, "epoch": 3 }, { "type": "loss", "content": 0.0016687085153535008, "timestamp": "2025-09-30 22:11:32.087917", "step": 1880, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:32.124629", "step": 1880, "epoch": 3 }, { "type": "loss", "content": 0.004651383031159639, "timestamp": "2025-09-30 22:11:32.127319", "step": 1881, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:11:33.074469", "step": 1881, "epoch": 3 }, { "type": "pplx", "content": 58300001.57745322, "timestamp": "2025-09-30 22:11:33.086962", "step": 1881, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:33.123593", "step": 1881, "epoch": 3 }, { "type": "loss", "content": 0.0006952568655833602, "timestamp": "2025-09-30 22:11:33.132758", "step": 1882, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:33.164032", "step": 1882, "epoch": 3 }, { "type": "loss", "content": 0.000541742134373635, "timestamp": "2025-09-30 22:11:33.166746", "step": 1883, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:33.209638", "step": 1883, "epoch": 3 }, { "type": "loss", "content": 0.01002055685967207, "timestamp": "2025-09-30 22:11:33.234969", "step": 1884, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:33.265653", "step": 1884, "epoch": 3 }, { "type": "loss", "content": 0.0002052335039479658, "timestamp": "2025-09-30 22:11:33.268371", "step": 1885, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:33.299637", "step": 1885, "epoch": 3 }, { "type": "loss", "content": 0.047041479498147964, "timestamp": "2025-09-30 22:11:33.302040", "step": 1886, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:33.332696", "step": 1886, "epoch": 3 }, { "type": "loss", "content": 0.016662245616316795, "timestamp": "2025-09-30 22:11:33.335426", "step": 1887, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:33.365138", "step": 1887, "epoch": 3 }, { "type": "loss", "content": 0.02230525016784668, "timestamp": "2025-09-30 22:11:33.391292", "step": 1888, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:33.421717", "step": 1888, "epoch": 3 }, { "type": "loss", "content": 0.009830297902226448, "timestamp": "2025-09-30 22:11:33.423857", "step": 1889, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:33.453551", "step": 1889, "epoch": 3 }, { "type": "loss", "content": 0.010316459462046623, "timestamp": "2025-09-30 22:11:33.456019", "step": 1890, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:33.485785", "step": 1890, "epoch": 3 }, { "type": "loss", "content": 0.006007314659655094, "timestamp": "2025-09-30 22:11:33.487984", "step": 1891, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:33.518765", "step": 1891, "epoch": 3 }, { "type": "loss", "content": 0.0068843550980091095, "timestamp": "2025-09-30 22:11:33.543397", "step": 1892, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:33.588940", "step": 1892, "epoch": 3 }, { "type": "loss", "content": 0.006098180077970028, "timestamp": "2025-09-30 22:11:33.598198", "step": 1893, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:33.636589", "step": 1893, "epoch": 3 }, { "type": "loss", "content": 0.04380368813872337, "timestamp": "2025-09-30 22:11:33.639790", "step": 1894, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:33.677421", "step": 1894, "epoch": 3 }, { "type": "loss", "content": 0.0040383776649832726, "timestamp": "2025-09-30 22:11:33.684386", "step": 1895, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:33.721214", "step": 1895, "epoch": 3 }, { "type": "loss", "content": 0.021266529336571693, "timestamp": "2025-09-30 22:11:33.745076", "step": 1896, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:33.786419", "step": 1896, "epoch": 3 }, { "type": "loss", "content": 0.00014776589523535222, "timestamp": "2025-09-30 22:11:33.791990", "step": 1897, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:33.823137", "step": 1897, "epoch": 3 }, { "type": "loss", "content": 0.0008200978627428412, "timestamp": "2025-09-30 22:11:33.828972", "step": 1898, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:33.861415", "step": 1898, "epoch": 3 }, { "type": "loss", "content": 0.0001828258391469717, "timestamp": "2025-09-30 22:11:33.865791", "step": 1899, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:33.896717", "step": 1899, "epoch": 3 }, { "type": "loss", "content": 0.05202025920152664, "timestamp": "2025-09-30 22:11:33.921215", "step": 1900, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:33.952903", "step": 1900, "epoch": 3 }, { "type": "loss", "content": 0.0005612968816421926, "timestamp": "2025-09-30 22:11:33.957467", "step": 1901, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:33.995578", "step": 1901, "epoch": 3 }, { "type": "loss", "content": 0.0028064267244189978, "timestamp": "2025-09-30 22:11:33.998196", "step": 1902, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:34.041465", "step": 1902, "epoch": 3 }, { "type": "loss", "content": 0.01332185510545969, "timestamp": "2025-09-30 22:11:34.055049", "step": 1903, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:34.093277", "step": 1903, "epoch": 3 }, { "type": "loss", "content": 0.03658471629023552, "timestamp": "2025-09-30 22:11:34.116742", "step": 1904, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:34.147804", "step": 1904, "epoch": 3 }, { "type": "loss", "content": 0.00020756521553266793, "timestamp": "2025-09-30 22:11:34.149961", "step": 1905, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:34.184187", "step": 1905, "epoch": 3 }, { "type": "loss", "content": 0.04848742485046387, "timestamp": "2025-09-30 22:11:34.186798", "step": 1906, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:34.217337", "step": 1906, "epoch": 3 }, { "type": "loss", "content": 0.0005019632517360151, "timestamp": "2025-09-30 22:11:34.219518", "step": 1907, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:34.255246", "step": 1907, "epoch": 3 }, { "type": "loss", "content": 0.03832434490323067, "timestamp": "2025-09-30 22:11:34.278878", "step": 1908, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:34.308405", "step": 1908, "epoch": 3 }, { "type": "loss", "content": 0.007056743372231722, "timestamp": "2025-09-30 22:11:34.311095", "step": 1909, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:34.341428", "step": 1909, "epoch": 3 }, { "type": "loss", "content": 0.005560364108532667, "timestamp": "2025-09-30 22:11:34.343587", "step": 1910, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:34.374874", "step": 1910, "epoch": 3 }, { "type": "loss", "content": 0.007207165006548166, "timestamp": "2025-09-30 22:11:34.377181", "step": 1911, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:34.408075", "step": 1911, "epoch": 3 }, { "type": "loss", "content": 0.002402246231213212, "timestamp": "2025-09-30 22:11:34.431295", "step": 1912, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:34.487213", "step": 1912, "epoch": 3 }, { "type": "loss", "content": 0.0030392964836210012, "timestamp": "2025-09-30 22:11:34.489529", "step": 1913, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:34.521423", "step": 1913, "epoch": 3 }, { "type": "loss", "content": 0.014510233886539936, "timestamp": "2025-09-30 22:11:34.523588", "step": 1914, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:34.555635", "step": 1914, "epoch": 3 }, { "type": "loss", "content": 0.02432306483387947, "timestamp": "2025-09-30 22:11:34.558991", "step": 1915, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:34.593283", "step": 1915, "epoch": 3 }, { "type": "loss", "content": 0.009666199795901775, "timestamp": "2025-09-30 22:11:34.617758", "step": 1916, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:34.650002", "step": 1916, "epoch": 3 }, { "type": "loss", "content": 0.001959824236109853, "timestamp": "2025-09-30 22:11:34.654015", "step": 1917, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:34.692460", "step": 1917, "epoch": 3 }, { "type": "loss", "content": 0.00908661913126707, "timestamp": "2025-09-30 22:11:34.696389", "step": 1918, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:34.730218", "step": 1918, "epoch": 3 }, { "type": "loss", "content": 0.002566373208537698, "timestamp": "2025-09-30 22:11:34.735628", "step": 1919, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:34.768795", "step": 1919, "epoch": 3 }, { "type": "loss", "content": 0.03425890579819679, "timestamp": "2025-09-30 22:11:34.794343", "step": 1920, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:34.833089", "step": 1920, "epoch": 3 }, { "type": "loss", "content": 0.0045762574300169945, "timestamp": "2025-09-30 22:11:34.836814", "step": 1921, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:34.877627", "step": 1921, "epoch": 3 }, { "type": "loss", "content": 0.003459982108324766, "timestamp": "2025-09-30 22:11:34.880080", "step": 1922, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:34.915849", "step": 1922, "epoch": 3 }, { "type": "loss", "content": 0.032456815242767334, "timestamp": "2025-09-30 22:11:34.917558", "step": 1923, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:34.951722", "step": 1923, "epoch": 3 }, { "type": "loss", "content": 0.017405377700924873, "timestamp": "2025-09-30 22:11:34.975704", "step": 1924, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:35.006934", "step": 1924, "epoch": 3 }, { "type": "loss", "content": 0.02231362648308277, "timestamp": "2025-09-30 22:11:35.016028", "step": 1925, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:35.052642", "step": 1925, "epoch": 3 }, { "type": "loss", "content": 0.02806389331817627, "timestamp": "2025-09-30 22:11:35.055126", "step": 1926, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:35.087151", "step": 1926, "epoch": 3 }, { "type": "loss", "content": 0.0009463157621212304, "timestamp": "2025-09-30 22:11:35.090545", "step": 1927, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:35.123466", "step": 1927, "epoch": 3 }, { "type": "loss", "content": 0.006710561458021402, "timestamp": "2025-09-30 22:11:35.153766", "step": 1928, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:35.197962", "step": 1928, "epoch": 3 }, { "type": "loss", "content": 0.02229662612080574, "timestamp": "2025-09-30 22:11:35.204638", "step": 1929, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:35.239393", "step": 1929, "epoch": 3 }, { "type": "loss", "content": 0.048850979655981064, "timestamp": "2025-09-30 22:11:35.242809", "step": 1930, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:35.274787", "step": 1930, "epoch": 3 }, { "type": "loss", "content": 0.00973884854465723, "timestamp": "2025-09-30 22:11:35.280935", "step": 1931, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:35.314391", "step": 1931, "epoch": 3 }, { "type": "loss", "content": 0.002838547807186842, "timestamp": "2025-09-30 22:11:35.338628", "step": 1932, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:35.370152", "step": 1932, "epoch": 3 }, { "type": "loss", "content": 0.0015181986382231116, "timestamp": "2025-09-30 22:11:35.375264", "step": 1933, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:35.408177", "step": 1933, "epoch": 3 }, { "type": "loss", "content": 0.007999510504305363, "timestamp": "2025-09-30 22:11:35.409933", "step": 1934, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:35.440019", "step": 1934, "epoch": 3 }, { "type": "loss", "content": 0.0033200494945049286, "timestamp": "2025-09-30 22:11:35.442597", "step": 1935, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:35.475465", "step": 1935, "epoch": 3 }, { "type": "loss", "content": 0.005467348266392946, "timestamp": "2025-09-30 22:11:35.499736", "step": 1936, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:35.534777", "step": 1936, "epoch": 3 }, { "type": "loss", "content": 0.006198174320161343, "timestamp": "2025-09-30 22:11:35.540626", "step": 1937, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:35.577303", "step": 1937, "epoch": 3 }, { "type": "loss", "content": 0.005689568817615509, "timestamp": "2025-09-30 22:11:35.579727", "step": 1938, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:11:36.596669", "step": 1938, "epoch": 3 }, { "type": "pplx", "content": 43627380.275860295, "timestamp": "2025-09-30 22:11:36.599167", "step": 1938, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:36.631668", "step": 1938, "epoch": 3 }, { "type": "loss", "content": 0.041265103965997696, "timestamp": "2025-09-30 22:11:36.634148", "step": 1939, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:36.664786", "step": 1939, "epoch": 3 }, { "type": "loss", "content": 0.0021771376486867666, "timestamp": "2025-09-30 22:11:36.690810", "step": 1940, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:36.728941", "step": 1940, "epoch": 3 }, { "type": "loss", "content": 0.009394007734954357, "timestamp": "2025-09-30 22:11:36.733213", "step": 1941, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:36.764817", "step": 1941, "epoch": 3 }, { "type": "loss", "content": 0.00999508984386921, "timestamp": "2025-09-30 22:11:36.770123", "step": 1942, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:36.807388", "step": 1942, "epoch": 3 }, { "type": "loss", "content": 0.03857743740081787, "timestamp": "2025-09-30 22:11:36.812467", "step": 1943, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:11:36.845704", "step": 1943, "epoch": 3 }, { "type": "loss", "content": 0.041628073900938034, "timestamp": "2025-09-30 22:11:36.869338", "step": 1944, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:36.914712", "step": 1944, "epoch": 3 }, { "type": "loss", "content": 0.0016767786582931876, "timestamp": "2025-09-30 22:11:36.917931", "step": 1945, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:36.948917", "step": 1945, "epoch": 3 }, { "type": "loss", "content": 0.012962265871465206, "timestamp": "2025-09-30 22:11:36.954441", "step": 1946, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:36.993420", "step": 1946, "epoch": 3 }, { "type": "loss", "content": 0.003387217177078128, "timestamp": "2025-09-30 22:11:36.996098", "step": 1947, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:37.047895", "step": 1947, "epoch": 3 }, { "type": "loss", "content": 0.0020589407067745924, "timestamp": "2025-09-30 22:11:37.074894", "step": 1948, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:37.109800", "step": 1948, "epoch": 3 }, { "type": "loss", "content": 0.0035861472133547068, "timestamp": "2025-09-30 22:11:37.121958", "step": 1949, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:37.155186", "step": 1949, "epoch": 3 }, { "type": "loss", "content": 0.011754592880606651, "timestamp": "2025-09-30 22:11:37.159976", "step": 1950, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:37.198710", "step": 1950, "epoch": 3 }, { "type": "loss", "content": 0.012257657945156097, "timestamp": "2025-09-30 22:11:37.204813", "step": 1951, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:37.246821", "step": 1951, "epoch": 3 }, { "type": "loss", "content": 0.028148228302598, "timestamp": "2025-09-30 22:11:37.273645", "step": 1952, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:37.310597", "step": 1952, "epoch": 3 }, { "type": "loss", "content": 0.001430458389222622, "timestamp": "2025-09-30 22:11:37.313023", "step": 1953, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:37.347053", "step": 1953, "epoch": 3 }, { "type": "loss", "content": 0.003840786637738347, "timestamp": "2025-09-30 22:11:37.349462", "step": 1954, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:37.381022", "step": 1954, "epoch": 3 }, { "type": "loss", "content": 0.04572409391403198, "timestamp": "2025-09-30 22:11:37.383572", "step": 1955, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:37.427216", "step": 1955, "epoch": 3 }, { "type": "loss", "content": 0.015760038048028946, "timestamp": "2025-09-30 22:11:37.450976", "step": 1956, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:37.482989", "step": 1956, "epoch": 3 }, { "type": "loss", "content": 0.009832469746470451, "timestamp": "2025-09-30 22:11:37.486785", "step": 1957, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:37.519959", "step": 1957, "epoch": 3 }, { "type": "loss", "content": 0.0026130271144211292, "timestamp": "2025-09-30 22:11:37.523027", "step": 1958, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:37.559954", "step": 1958, "epoch": 3 }, { "type": "loss", "content": 0.004075376782566309, "timestamp": "2025-09-30 22:11:37.562879", "step": 1959, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:37.600938", "step": 1959, "epoch": 3 }, { "type": "loss", "content": 0.04126408323645592, "timestamp": "2025-09-30 22:11:37.632136", "step": 1960, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:37.673315", "step": 1960, "epoch": 3 }, { "type": "loss", "content": 0.0035435904283076525, "timestamp": "2025-09-30 22:11:37.675789", "step": 1961, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:37.708499", "step": 1961, "epoch": 3 }, { "type": "loss", "content": 0.004043902270495892, "timestamp": "2025-09-30 22:11:37.710923", "step": 1962, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:37.744656", "step": 1962, "epoch": 3 }, { "type": "loss", "content": 0.015519635751843452, "timestamp": "2025-09-30 22:11:37.748859", "step": 1963, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:37.786265", "step": 1963, "epoch": 3 }, { "type": "loss", "content": 0.030089065432548523, "timestamp": "2025-09-30 22:11:37.810117", "step": 1964, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:37.843879", "step": 1964, "epoch": 3 }, { "type": "loss", "content": 0.007775893900543451, "timestamp": "2025-09-30 22:11:37.847497", "step": 1965, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:37.885079", "step": 1965, "epoch": 3 }, { "type": "loss", "content": 0.003808345878496766, "timestamp": "2025-09-30 22:11:37.890297", "step": 1966, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:37.925134", "step": 1966, "epoch": 3 }, { "type": "loss", "content": 0.004614766221493483, "timestamp": "2025-09-30 22:11:37.928706", "step": 1967, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:37.963176", "step": 1967, "epoch": 3 }, { "type": "loss", "content": 0.017028817906975746, "timestamp": "2025-09-30 22:11:37.987256", "step": 1968, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:38.022126", "step": 1968, "epoch": 3 }, { "type": "loss", "content": 0.0027420276310294867, "timestamp": "2025-09-30 22:11:38.024815", "step": 1969, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:38.058904", "step": 1969, "epoch": 3 }, { "type": "loss", "content": 0.0091890012845397, "timestamp": "2025-09-30 22:11:38.064282", "step": 1970, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:38.095938", "step": 1970, "epoch": 3 }, { "type": "loss", "content": 0.006831838749349117, "timestamp": "2025-09-30 22:11:38.098149", "step": 1971, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:38.137643", "step": 1971, "epoch": 3 }, { "type": "loss", "content": 0.021977385506033897, "timestamp": "2025-09-30 22:11:38.161325", "step": 1972, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:38.201039", "step": 1972, "epoch": 3 }, { "type": "loss", "content": 0.008382284082472324, "timestamp": "2025-09-30 22:11:38.204053", "step": 1973, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:38.240818", "step": 1973, "epoch": 3 }, { "type": "loss", "content": 0.0059802415780723095, "timestamp": "2025-09-30 22:11:38.243117", "step": 1974, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:38.275063", "step": 1974, "epoch": 3 }, { "type": "loss", "content": 0.019793765619397163, "timestamp": "2025-09-30 22:11:38.277255", "step": 1975, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:38.308258", "step": 1975, "epoch": 3 }, { "type": "loss", "content": 0.01100704912096262, "timestamp": "2025-09-30 22:11:38.332794", "step": 1976, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:38.364649", "step": 1976, "epoch": 3 }, { "type": "loss", "content": 0.007676057517528534, "timestamp": "2025-09-30 22:11:38.367204", "step": 1977, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:38.402216", "step": 1977, "epoch": 3 }, { "type": "loss", "content": 0.0077926525846123695, "timestamp": "2025-09-30 22:11:38.405555", "step": 1978, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:38.451705", "step": 1978, "epoch": 3 }, { "type": "loss", "content": 0.021432368084788322, "timestamp": "2025-09-30 22:11:38.455731", "step": 1979, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:38.491779", "step": 1979, "epoch": 3 }, { "type": "loss", "content": 0.003779368242248893, "timestamp": "2025-09-30 22:11:38.516552", "step": 1980, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:38.550531", "step": 1980, "epoch": 3 }, { "type": "loss", "content": 0.006811430212110281, "timestamp": "2025-09-30 22:11:38.554924", "step": 1981, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:38.587888", "step": 1981, "epoch": 3 }, { "type": "loss", "content": 0.008039762265980244, "timestamp": "2025-09-30 22:11:38.589999", "step": 1982, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:38.621379", "step": 1982, "epoch": 3 }, { "type": "loss", "content": 0.010187317617237568, "timestamp": "2025-09-30 22:11:38.623660", "step": 1983, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:38.653657", "step": 1983, "epoch": 3 }, { "type": "loss", "content": 0.00863056443631649, "timestamp": "2025-09-30 22:11:38.677326", "step": 1984, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:38.713997", "step": 1984, "epoch": 3 }, { "type": "loss", "content": 0.004583257250487804, "timestamp": "2025-09-30 22:11:38.716903", "step": 1985, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:38.748750", "step": 1985, "epoch": 3 }, { "type": "loss", "content": 0.00606236606836319, "timestamp": "2025-09-30 22:11:38.751793", "step": 1986, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:38.781892", "step": 1986, "epoch": 3 }, { "type": "loss", "content": 0.027738112956285477, "timestamp": "2025-09-30 22:11:38.785210", "step": 1987, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:38.817077", "step": 1987, "epoch": 3 }, { "type": "loss", "content": 0.01934841088950634, "timestamp": "2025-09-30 22:11:38.840780", "step": 1988, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:38.871270", "step": 1988, "epoch": 3 }, { "type": "loss", "content": 0.0069843316450715065, "timestamp": "2025-09-30 22:11:38.874007", "step": 1989, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:38.906349", "step": 1989, "epoch": 3 }, { "type": "loss", "content": 0.003509870497509837, "timestamp": "2025-09-30 22:11:38.908453", "step": 1990, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:38.941136", "step": 1990, "epoch": 3 }, { "type": "loss", "content": 0.031461603939533234, "timestamp": "2025-09-30 22:11:38.943272", "step": 1991, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:38.977005", "step": 1991, "epoch": 3 }, { "type": "loss", "content": 0.02272350713610649, "timestamp": "2025-09-30 22:11:39.000736", "step": 1992, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:39.044288", "step": 1992, "epoch": 3 }, { "type": "loss", "content": 0.04014478996396065, "timestamp": "2025-09-30 22:11:39.046856", "step": 1993, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:39.077258", "step": 1993, "epoch": 3 }, { "type": "loss", "content": 0.0069357166066765785, "timestamp": "2025-09-30 22:11:39.079878", "step": 1994, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:39.119734", "step": 1994, "epoch": 3 }, { "type": "loss", "content": 0.00451026763767004, "timestamp": "2025-09-30 22:11:39.124010", "step": 1995, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:11:39.890481", "step": 1995, "epoch": 3 }, { "type": "pplx", "content": 45151114.894568056, "timestamp": "2025-09-30 22:11:39.892281", "step": 1995, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:39.921447", "step": 1995, "epoch": 3 }, { "type": "loss", "content": 0.0034852561075240374, "timestamp": "2025-09-30 22:11:39.945905", "step": 1996, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:39.980609", "step": 1996, "epoch": 3 }, { "type": "loss", "content": 0.0041796243749558926, "timestamp": "2025-09-30 22:11:39.983239", "step": 1997, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:40.017253", "step": 1997, "epoch": 3 }, { "type": "loss", "content": 0.01105498243123293, "timestamp": "2025-09-30 22:11:40.019276", "step": 1998, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:40.049220", "step": 1998, "epoch": 3 }, { "type": "loss", "content": 0.02129027433693409, "timestamp": "2025-09-30 22:11:40.051677", "step": 1999, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:40.086422", "step": 1999, "epoch": 3 }, { "type": "loss", "content": 0.012991896830499172, "timestamp": "2025-09-30 22:11:40.110596", "step": 2000, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2000", "timestamp": "2025-09-30 22:11:47.120027", "step": 2000, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:47.171133", "step": 2000, "epoch": 3 }, { "type": "loss", "content": 0.011763842776417732, "timestamp": "2025-09-30 22:11:47.173258", "step": 2001, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:47.205359", "step": 2001, "epoch": 3 }, { "type": "loss", "content": 0.001990505028516054, "timestamp": "2025-09-30 22:11:47.207565", "step": 2002, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:47.237348", "step": 2002, "epoch": 3 }, { "type": "loss", "content": 0.0028346367180347443, "timestamp": "2025-09-30 22:11:47.239951", "step": 2003, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:47.271096", "step": 2003, "epoch": 3 }, { "type": "loss", "content": 0.040164608508348465, "timestamp": "2025-09-30 22:11:47.294777", "step": 2004, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:47.324666", "step": 2004, "epoch": 3 }, { "type": "loss", "content": 0.0015259806532412767, "timestamp": "2025-09-30 22:11:47.326542", "step": 2005, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:47.356996", "step": 2005, "epoch": 3 }, { "type": "loss", "content": 0.023955607786774635, "timestamp": "2025-09-30 22:11:47.359226", "step": 2006, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:47.389113", "step": 2006, "epoch": 3 }, { "type": "loss", "content": 0.010714337229728699, "timestamp": "2025-09-30 22:11:47.391113", "step": 2007, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:47.420113", "step": 2007, "epoch": 3 }, { "type": "loss", "content": 0.019530069082975388, "timestamp": "2025-09-30 22:11:47.444120", "step": 2008, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:47.478806", "step": 2008, "epoch": 3 }, { "type": "loss", "content": 0.005085115786641836, "timestamp": "2025-09-30 22:11:47.480636", "step": 2009, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:47.510149", "step": 2009, "epoch": 3 }, { "type": "loss", "content": 0.0203965175896883, "timestamp": "2025-09-30 22:11:47.512273", "step": 2010, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:47.543661", "step": 2010, "epoch": 3 }, { "type": "loss", "content": 0.004833654034882784, "timestamp": "2025-09-30 22:11:47.546287", "step": 2011, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:47.576880", "step": 2011, "epoch": 3 }, { "type": "loss", "content": 0.0030174553394317627, "timestamp": "2025-09-30 22:11:47.600315", "step": 2012, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:47.632537", "step": 2012, "epoch": 3 }, { "type": "loss", "content": 0.0027652904391288757, "timestamp": "2025-09-30 22:11:47.634574", "step": 2013, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:47.665703", "step": 2013, "epoch": 3 }, { "type": "loss", "content": 0.0048994021490216255, "timestamp": "2025-09-30 22:11:47.668123", "step": 2014, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:47.698929", "step": 2014, "epoch": 3 }, { "type": "loss", "content": 0.024076785892248154, "timestamp": "2025-09-30 22:11:47.701111", "step": 2015, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:47.733693", "step": 2015, "epoch": 3 }, { "type": "loss", "content": 0.009501011110842228, "timestamp": "2025-09-30 22:11:47.757465", "step": 2016, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:47.787088", "step": 2016, "epoch": 3 }, { "type": "loss", "content": 0.009324478916823864, "timestamp": "2025-09-30 22:11:47.789314", "step": 2017, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:47.821263", "step": 2017, "epoch": 3 }, { "type": "loss", "content": 0.01339875441044569, "timestamp": "2025-09-30 22:11:47.823502", "step": 2018, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:47.853658", "step": 2018, "epoch": 3 }, { "type": "loss", "content": 0.002396277617663145, "timestamp": "2025-09-30 22:11:47.855895", "step": 2019, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:47.886382", "step": 2019, "epoch": 3 }, { "type": "loss", "content": 0.012006105855107307, "timestamp": "2025-09-30 22:11:47.909764", "step": 2020, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:47.941224", "step": 2020, "epoch": 3 }, { "type": "loss", "content": 0.0036790850572288036, "timestamp": "2025-09-30 22:11:47.955276", "step": 2021, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:47.986337", "step": 2021, "epoch": 3 }, { "type": "loss", "content": 0.006259873043745756, "timestamp": "2025-09-30 22:11:47.989571", "step": 2022, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:48.019824", "step": 2022, "epoch": 3 }, { "type": "loss", "content": 0.0020233269315212965, "timestamp": "2025-09-30 22:11:48.022334", "step": 2023, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:48.055789", "step": 2023, "epoch": 3 }, { "type": "loss", "content": 0.008467396721243858, "timestamp": "2025-09-30 22:11:48.079904", "step": 2024, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:48.113280", "step": 2024, "epoch": 3 }, { "type": "loss", "content": 0.013029148802161217, "timestamp": "2025-09-30 22:11:48.116011", "step": 2025, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:48.146751", "step": 2025, "epoch": 3 }, { "type": "loss", "content": 0.0013387370854616165, "timestamp": "2025-09-30 22:11:48.148866", "step": 2026, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:48.180806", "step": 2026, "epoch": 3 }, { "type": "loss", "content": 0.002847356954589486, "timestamp": "2025-09-30 22:11:48.183220", "step": 2027, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:48.214279", "step": 2027, "epoch": 3 }, { "type": "loss", "content": 0.003806897671893239, "timestamp": "2025-09-30 22:11:48.238058", "step": 2028, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:48.269355", "step": 2028, "epoch": 3 }, { "type": "loss", "content": 0.020344218239188194, "timestamp": "2025-09-30 22:11:48.271870", "step": 2029, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:48.302858", "step": 2029, "epoch": 3 }, { "type": "loss", "content": 0.011094714514911175, "timestamp": "2025-09-30 22:11:48.305095", "step": 2030, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:48.336226", "step": 2030, "epoch": 3 }, { "type": "loss", "content": 0.003669965546578169, "timestamp": "2025-09-30 22:11:48.338542", "step": 2031, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:48.369177", "step": 2031, "epoch": 3 }, { "type": "loss", "content": 0.00963125191628933, "timestamp": "2025-09-30 22:11:48.393090", "step": 2032, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:48.423617", "step": 2032, "epoch": 3 }, { "type": "loss", "content": 0.00689741550013423, "timestamp": "2025-09-30 22:11:48.425619", "step": 2033, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:48.455255", "step": 2033, "epoch": 3 }, { "type": "loss", "content": 0.010891190730035305, "timestamp": "2025-09-30 22:11:48.457542", "step": 2034, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:48.488398", "step": 2034, "epoch": 3 }, { "type": "loss", "content": 0.003279446391388774, "timestamp": "2025-09-30 22:11:48.494058", "step": 2035, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:48.524167", "step": 2035, "epoch": 3 }, { "type": "loss", "content": 0.01791677437722683, "timestamp": "2025-09-30 22:11:48.547641", "step": 2036, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:48.578445", "step": 2036, "epoch": 3 }, { "type": "loss", "content": 0.008618688210844994, "timestamp": "2025-09-30 22:11:48.580755", "step": 2037, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:48.614180", "step": 2037, "epoch": 3 }, { "type": "loss", "content": 0.023733243346214294, "timestamp": "2025-09-30 22:11:48.616326", "step": 2038, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:48.648518", "step": 2038, "epoch": 3 }, { "type": "loss", "content": 0.0052038319408893585, "timestamp": "2025-09-30 22:11:48.651005", "step": 2039, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:48.680617", "step": 2039, "epoch": 3 }, { "type": "loss", "content": 0.005849936511367559, "timestamp": "2025-09-30 22:11:48.704292", "step": 2040, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:48.734190", "step": 2040, "epoch": 3 }, { "type": "loss", "content": 0.009815479628741741, "timestamp": "2025-09-30 22:11:48.736277", "step": 2041, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:48.766915", "step": 2041, "epoch": 3 }, { "type": "loss", "content": 0.0007055861060507596, "timestamp": "2025-09-30 22:11:48.769193", "step": 2042, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:48.799254", "step": 2042, "epoch": 3 }, { "type": "loss", "content": 0.012282871641218662, "timestamp": "2025-09-30 22:11:48.802790", "step": 2043, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:48.832188", "step": 2043, "epoch": 3 }, { "type": "loss", "content": 0.015396817587316036, "timestamp": "2025-09-30 22:11:48.856754", "step": 2044, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:48.889786", "step": 2044, "epoch": 3 }, { "type": "loss", "content": 0.0020254473201930523, "timestamp": "2025-09-30 22:11:48.893077", "step": 2045, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:48.924965", "step": 2045, "epoch": 3 }, { "type": "loss", "content": 0.03287850692868233, "timestamp": "2025-09-30 22:11:48.927323", "step": 2046, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:48.959676", "step": 2046, "epoch": 3 }, { "type": "loss", "content": 0.013674319721758366, "timestamp": "2025-09-30 22:11:48.962453", "step": 2047, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:48.992228", "step": 2047, "epoch": 3 }, { "type": "loss", "content": 0.005486636888235807, "timestamp": "2025-09-30 22:11:49.015942", "step": 2048, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:49.046842", "step": 2048, "epoch": 3 }, { "type": "loss", "content": 0.0018138373270630836, "timestamp": "2025-09-30 22:11:49.051151", "step": 2049, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:49.085344", "step": 2049, "epoch": 3 }, { "type": "loss", "content": 0.007017158903181553, "timestamp": "2025-09-30 22:11:49.087734", "step": 2050, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:49.120421", "step": 2050, "epoch": 3 }, { "type": "loss", "content": 0.006239436566829681, "timestamp": "2025-09-30 22:11:49.123116", "step": 2051, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:49.162657", "step": 2051, "epoch": 3 }, { "type": "loss", "content": 0.0296341422945261, "timestamp": "2025-09-30 22:11:49.186855", "step": 2052, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:11:49.938037", "step": 2052, "epoch": 3 }, { "type": "pplx", "content": 46841069.929396845, "timestamp": "2025-09-30 22:11:49.940737", "step": 2052, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:49.968858", "step": 2052, "epoch": 3 }, { "type": "loss", "content": 0.002824206370860338, "timestamp": "2025-09-30 22:11:49.971342", "step": 2053, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:50.001368", "step": 2053, "epoch": 3 }, { "type": "loss", "content": 0.0025209374725818634, "timestamp": "2025-09-30 22:11:50.004275", "step": 2054, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:50.035925", "step": 2054, "epoch": 3 }, { "type": "loss", "content": 0.005634330213069916, "timestamp": "2025-09-30 22:11:50.038218", "step": 2055, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:50.068792", "step": 2055, "epoch": 3 }, { "type": "loss", "content": 0.001257882104255259, "timestamp": "2025-09-30 22:11:50.093493", "step": 2056, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:50.123631", "step": 2056, "epoch": 3 }, { "type": "loss", "content": 0.0063269915990531445, "timestamp": "2025-09-30 22:11:50.126394", "step": 2057, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:50.157085", "step": 2057, "epoch": 3 }, { "type": "loss", "content": 0.006260029971599579, "timestamp": "2025-09-30 22:11:50.159285", "step": 2058, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:50.192581", "step": 2058, "epoch": 3 }, { "type": "loss", "content": 0.011972022242844105, "timestamp": "2025-09-30 22:11:50.194639", "step": 2059, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:50.232178", "step": 2059, "epoch": 3 }, { "type": "loss", "content": 0.0028113038279116154, "timestamp": "2025-09-30 22:11:50.256226", "step": 2060, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:50.286662", "step": 2060, "epoch": 3 }, { "type": "loss", "content": 0.00559657160192728, "timestamp": "2025-09-30 22:11:50.288730", "step": 2061, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:50.318881", "step": 2061, "epoch": 3 }, { "type": "loss", "content": 0.0006818880210630596, "timestamp": "2025-09-30 22:11:50.324812", "step": 2062, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:50.360667", "step": 2062, "epoch": 3 }, { "type": "loss", "content": 0.022419020533561707, "timestamp": "2025-09-30 22:11:50.363341", "step": 2063, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:50.394960", "step": 2063, "epoch": 3 }, { "type": "loss", "content": 0.003586748382076621, "timestamp": "2025-09-30 22:11:50.418215", "step": 2064, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:50.448556", "step": 2064, "epoch": 3 }, { "type": "loss", "content": 0.0011212294921278954, "timestamp": "2025-09-30 22:11:50.455815", "step": 2065, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:50.501180", "step": 2065, "epoch": 3 }, { "type": "loss", "content": 0.0036978931166231632, "timestamp": "2025-09-30 22:11:50.503528", "step": 2066, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:50.534271", "step": 2066, "epoch": 3 }, { "type": "loss", "content": 0.0025766324251890182, "timestamp": "2025-09-30 22:11:50.536403", "step": 2067, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:50.566915", "step": 2067, "epoch": 3 }, { "type": "loss", "content": 0.024868767708539963, "timestamp": "2025-09-30 22:11:50.590573", "step": 2068, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:50.623035", "step": 2068, "epoch": 3 }, { "type": "loss", "content": 0.0010927347466349602, "timestamp": "2025-09-30 22:11:50.625739", "step": 2069, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:50.656140", "step": 2069, "epoch": 3 }, { "type": "loss", "content": 0.009025784209370613, "timestamp": "2025-09-30 22:11:50.658880", "step": 2070, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:50.688974", "step": 2070, "epoch": 3 }, { "type": "loss", "content": 0.0004706038744188845, "timestamp": "2025-09-30 22:11:50.690971", "step": 2071, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:50.720799", "step": 2071, "epoch": 3 }, { "type": "loss", "content": 0.023899797350168228, "timestamp": "2025-09-30 22:11:50.744231", "step": 2072, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:50.781701", "step": 2072, "epoch": 3 }, { "type": "loss", "content": 0.016089284792542458, "timestamp": "2025-09-30 22:11:50.784228", "step": 2073, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:50.815685", "step": 2073, "epoch": 3 }, { "type": "loss", "content": 0.0031490155961364508, "timestamp": "2025-09-30 22:11:50.817925", "step": 2074, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:50.847379", "step": 2074, "epoch": 3 }, { "type": "loss", "content": 0.0023220006842166185, "timestamp": "2025-09-30 22:11:50.849849", "step": 2075, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:50.879700", "step": 2075, "epoch": 3 }, { "type": "loss", "content": 0.005324964411556721, "timestamp": "2025-09-30 22:11:50.904241", "step": 2076, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:50.935012", "step": 2076, "epoch": 3 }, { "type": "loss", "content": 0.005782104562968016, "timestamp": "2025-09-30 22:11:50.937191", "step": 2077, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:50.969077", "step": 2077, "epoch": 3 }, { "type": "loss", "content": 0.02153591811656952, "timestamp": "2025-09-30 22:11:50.971267", "step": 2078, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:51.003235", "step": 2078, "epoch": 3 }, { "type": "loss", "content": 0.029172439128160477, "timestamp": "2025-09-30 22:11:51.006114", "step": 2079, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:51.052262", "step": 2079, "epoch": 3 }, { "type": "loss", "content": 0.0013484329683706164, "timestamp": "2025-09-30 22:11:51.075692", "step": 2080, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:51.110716", "step": 2080, "epoch": 3 }, { "type": "loss", "content": 0.005239028949290514, "timestamp": "2025-09-30 22:11:51.112549", "step": 2081, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:51.142907", "step": 2081, "epoch": 3 }, { "type": "loss", "content": 0.0015419954434037209, "timestamp": "2025-09-30 22:11:51.144985", "step": 2082, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:51.174915", "step": 2082, "epoch": 3 }, { "type": "loss", "content": 0.0034008428920060396, "timestamp": "2025-09-30 22:11:51.177065", "step": 2083, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:51.208306", "step": 2083, "epoch": 3 }, { "type": "loss", "content": 0.001755467732436955, "timestamp": "2025-09-30 22:11:51.232161", "step": 2084, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:51.262286", "step": 2084, "epoch": 3 }, { "type": "loss", "content": 0.03544512018561363, "timestamp": "2025-09-30 22:11:51.264468", "step": 2085, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:51.295461", "step": 2085, "epoch": 3 }, { "type": "loss", "content": 0.017085498198866844, "timestamp": "2025-09-30 22:11:51.298046", "step": 2086, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:51.328296", "step": 2086, "epoch": 3 }, { "type": "loss", "content": 0.009373554959893227, "timestamp": "2025-09-30 22:11:51.330684", "step": 2087, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:51.362616", "step": 2087, "epoch": 3 }, { "type": "loss", "content": 0.0034663775004446507, "timestamp": "2025-09-30 22:11:51.396071", "step": 2088, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:51.427474", "step": 2088, "epoch": 3 }, { "type": "loss", "content": 0.0010567883728072047, "timestamp": "2025-09-30 22:11:51.429400", "step": 2089, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:11:51.460199", "step": 2089, "epoch": 3 }, { "type": "loss", "content": 0.001030977233313024, "timestamp": "2025-09-30 22:11:51.464817", "step": 2090, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:51.494928", "step": 2090, "epoch": 3 }, { "type": "loss", "content": 0.0021973750554025173, "timestamp": "2025-09-30 22:11:51.497848", "step": 2091, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:51.528582", "step": 2091, "epoch": 3 }, { "type": "loss", "content": 0.01016116701066494, "timestamp": "2025-09-30 22:11:51.552300", "step": 2092, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:51.583649", "step": 2092, "epoch": 3 }, { "type": "loss", "content": 0.002128815045580268, "timestamp": "2025-09-30 22:11:51.585550", "step": 2093, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:51.619516", "step": 2093, "epoch": 3 }, { "type": "loss", "content": 0.0021064337342977524, "timestamp": "2025-09-30 22:11:51.622117", "step": 2094, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:51.658479", "step": 2094, "epoch": 3 }, { "type": "loss", "content": 0.0030677791219204664, "timestamp": "2025-09-30 22:11:51.660548", "step": 2095, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:51.693415", "step": 2095, "epoch": 3 }, { "type": "loss", "content": 0.04606208577752113, "timestamp": "2025-09-30 22:11:51.717523", "step": 2096, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:51.748039", "step": 2096, "epoch": 3 }, { "type": "loss", "content": 0.00207854644395411, "timestamp": "2025-09-30 22:11:51.750250", "step": 2097, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:51.781538", "step": 2097, "epoch": 3 }, { "type": "loss", "content": 0.025525161996483803, "timestamp": "2025-09-30 22:11:51.783799", "step": 2098, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:51.813989", "step": 2098, "epoch": 3 }, { "type": "loss", "content": 0.0032558145467191935, "timestamp": "2025-09-30 22:11:51.816499", "step": 2099, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:51.847061", "step": 2099, "epoch": 3 }, { "type": "loss", "content": 0.005950461141765118, "timestamp": "2025-09-30 22:11:51.872829", "step": 2100, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:51.904319", "step": 2100, "epoch": 3 }, { "type": "loss", "content": 0.0006777332164347172, "timestamp": "2025-09-30 22:11:51.906638", "step": 2101, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:51.936459", "step": 2101, "epoch": 3 }, { "type": "loss", "content": 0.0009718859218992293, "timestamp": "2025-09-30 22:11:51.938368", "step": 2102, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:51.969245", "step": 2102, "epoch": 3 }, { "type": "loss", "content": 0.042966216802597046, "timestamp": "2025-09-30 22:11:51.971916", "step": 2103, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:52.002329", "step": 2103, "epoch": 3 }, { "type": "loss", "content": 0.0009729270823299885, "timestamp": "2025-09-30 22:11:52.025887", "step": 2104, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:52.056745", "step": 2104, "epoch": 3 }, { "type": "loss", "content": 0.0013779571745544672, "timestamp": "2025-09-30 22:11:52.058970", "step": 2105, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:52.088875", "step": 2105, "epoch": 3 }, { "type": "loss", "content": 0.0018243792001158, "timestamp": "2025-09-30 22:11:52.091108", "step": 2106, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:52.122468", "step": 2106, "epoch": 3 }, { "type": "loss", "content": 0.002409034175798297, "timestamp": "2025-09-30 22:11:52.125767", "step": 2107, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:52.157168", "step": 2107, "epoch": 3 }, { "type": "loss", "content": 0.011469879187643528, "timestamp": "2025-09-30 22:11:52.180846", "step": 2108, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:52.211732", "step": 2108, "epoch": 3 }, { "type": "loss", "content": 0.014377386309206486, "timestamp": "2025-09-30 22:11:52.213983", "step": 2109, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:11:53.069474", "step": 2109, "epoch": 3 }, { "type": "pplx", "content": 43296303.09889206, "timestamp": "2025-09-30 22:11:53.079360", "step": 2109, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:53.109952", "step": 2109, "epoch": 3 }, { "type": "loss", "content": 0.0023443028330802917, "timestamp": "2025-09-30 22:11:53.114172", "step": 2110, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:53.145304", "step": 2110, "epoch": 3 }, { "type": "loss", "content": 0.0002606787602417171, "timestamp": "2025-09-30 22:11:53.148125", "step": 2111, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:53.179002", "step": 2111, "epoch": 3 }, { "type": "loss", "content": 0.004330683033913374, "timestamp": "2025-09-30 22:11:53.206509", "step": 2112, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:53.243614", "step": 2112, "epoch": 3 }, { "type": "loss", "content": 0.003312209853902459, "timestamp": "2025-09-30 22:11:53.256965", "step": 2113, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:53.294571", "step": 2113, "epoch": 3 }, { "type": "loss", "content": 0.022603752091526985, "timestamp": "2025-09-30 22:11:53.297170", "step": 2114, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:53.330089", "step": 2114, "epoch": 3 }, { "type": "loss", "content": 0.000934332434553653, "timestamp": "2025-09-30 22:11:53.332258", "step": 2115, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:53.366930", "step": 2115, "epoch": 3 }, { "type": "loss", "content": 0.006673852913081646, "timestamp": "2025-09-30 22:11:53.391771", "step": 2116, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:53.422202", "step": 2116, "epoch": 3 }, { "type": "loss", "content": 0.0022406296338886023, "timestamp": "2025-09-30 22:11:53.424234", "step": 2117, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:53.455438", "step": 2117, "epoch": 3 }, { "type": "loss", "content": 0.037457626312971115, "timestamp": "2025-09-30 22:11:53.457674", "step": 2118, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:53.488008", "step": 2118, "epoch": 3 }, { "type": "loss", "content": 0.0011722417548298836, "timestamp": "2025-09-30 22:11:53.490242", "step": 2119, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:53.520783", "step": 2119, "epoch": 3 }, { "type": "loss", "content": 0.0004422049969434738, "timestamp": "2025-09-30 22:11:53.544261", "step": 2120, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:11:53.575777", "step": 2120, "epoch": 3 }, { "type": "loss", "content": 0.0010047383839264512, "timestamp": "2025-09-30 22:11:53.577754", "step": 2121, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:53.608143", "step": 2121, "epoch": 3 }, { "type": "loss", "content": 0.012830023653805256, "timestamp": "2025-09-30 22:11:53.610547", "step": 2122, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:53.641410", "step": 2122, "epoch": 3 }, { "type": "loss", "content": 0.023233426734805107, "timestamp": "2025-09-30 22:11:53.643478", "step": 2123, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:53.675493", "step": 2123, "epoch": 3 }, { "type": "loss", "content": 0.0013475000159814954, "timestamp": "2025-09-30 22:11:53.699331", "step": 2124, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:53.730528", "step": 2124, "epoch": 3 }, { "type": "loss", "content": 0.005780854728072882, "timestamp": "2025-09-30 22:11:53.732761", "step": 2125, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:53.763686", "step": 2125, "epoch": 3 }, { "type": "loss", "content": 0.002301039407029748, "timestamp": "2025-09-30 22:11:53.765767", "step": 2126, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:53.796135", "step": 2126, "epoch": 3 }, { "type": "loss", "content": 0.011333170346915722, "timestamp": "2025-09-30 22:11:53.798286", "step": 2127, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:53.831111", "step": 2127, "epoch": 3 }, { "type": "loss", "content": 0.013510367833077908, "timestamp": "2025-09-30 22:11:53.854556", "step": 2128, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:53.885787", "step": 2128, "epoch": 3 }, { "type": "loss", "content": 0.003286774503067136, "timestamp": "2025-09-30 22:11:53.888009", "step": 2129, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:53.922905", "step": 2129, "epoch": 3 }, { "type": "loss", "content": 0.002777442801743746, "timestamp": "2025-09-30 22:11:53.925253", "step": 2130, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:53.956869", "step": 2130, "epoch": 3 }, { "type": "loss", "content": 0.0014969798503443599, "timestamp": "2025-09-30 22:11:53.958915", "step": 2131, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:53.989365", "step": 2131, "epoch": 3 }, { "type": "loss", "content": 0.006633365992456675, "timestamp": "2025-09-30 22:11:54.012958", "step": 2132, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:54.043183", "step": 2132, "epoch": 3 }, { "type": "loss", "content": 0.009707632474601269, "timestamp": "2025-09-30 22:11:54.045969", "step": 2133, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:54.078377", "step": 2133, "epoch": 3 }, { "type": "loss", "content": 0.0007816283032298088, "timestamp": "2025-09-30 22:11:54.084173", "step": 2134, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:54.115541", "step": 2134, "epoch": 3 }, { "type": "loss", "content": 0.0020825075916945934, "timestamp": "2025-09-30 22:11:54.117655", "step": 2135, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:54.160331", "step": 2135, "epoch": 3 }, { "type": "loss", "content": 0.0006500629824586213, "timestamp": "2025-09-30 22:11:54.185025", "step": 2136, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:54.215733", "step": 2136, "epoch": 3 }, { "type": "loss", "content": 0.004673543851822615, "timestamp": "2025-09-30 22:11:54.217989", "step": 2137, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:54.249423", "step": 2137, "epoch": 3 }, { "type": "loss", "content": 0.005406899843364954, "timestamp": "2025-09-30 22:11:54.252433", "step": 2138, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:54.287182", "step": 2138, "epoch": 3 }, { "type": "loss", "content": 0.000536845182068646, "timestamp": "2025-09-30 22:11:54.289777", "step": 2139, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:54.326367", "step": 2139, "epoch": 3 }, { "type": "loss", "content": 0.0016699606785550714, "timestamp": "2025-09-30 22:11:54.351702", "step": 2140, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:54.383402", "step": 2140, "epoch": 3 }, { "type": "loss", "content": 0.0018563418416306376, "timestamp": "2025-09-30 22:11:54.386335", "step": 2141, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:54.418835", "step": 2141, "epoch": 3 }, { "type": "loss", "content": 0.002186473226174712, "timestamp": "2025-09-30 22:11:54.422064", "step": 2142, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:54.457194", "step": 2142, "epoch": 3 }, { "type": "loss", "content": 0.0011395520996302366, "timestamp": "2025-09-30 22:11:54.460279", "step": 2143, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:54.491473", "step": 2143, "epoch": 3 }, { "type": "loss", "content": 0.005414300598204136, "timestamp": "2025-09-30 22:11:54.518972", "step": 2144, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:54.556454", "step": 2144, "epoch": 3 }, { "type": "loss", "content": 0.001469918410293758, "timestamp": "2025-09-30 22:11:54.558496", "step": 2145, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:54.590782", "step": 2145, "epoch": 3 }, { "type": "loss", "content": 0.0007216363446787, "timestamp": "2025-09-30 22:11:54.593067", "step": 2146, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:54.627549", "step": 2146, "epoch": 3 }, { "type": "loss", "content": 0.003747421083971858, "timestamp": "2025-09-30 22:11:54.629824", "step": 2147, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:54.660401", "step": 2147, "epoch": 3 }, { "type": "loss", "content": 0.001542619545944035, "timestamp": "2025-09-30 22:11:54.684354", "step": 2148, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:54.717346", "step": 2148, "epoch": 3 }, { "type": "loss", "content": 0.006204643286764622, "timestamp": "2025-09-30 22:11:54.719616", "step": 2149, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:54.751906", "step": 2149, "epoch": 3 }, { "type": "loss", "content": 0.0003929475205950439, "timestamp": "2025-09-30 22:11:54.757081", "step": 2150, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:54.790172", "step": 2150, "epoch": 3 }, { "type": "loss", "content": 0.01017941627651453, "timestamp": "2025-09-30 22:11:54.792528", "step": 2151, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:54.828313", "step": 2151, "epoch": 3 }, { "type": "loss", "content": 0.005522797349840403, "timestamp": "2025-09-30 22:11:54.852416", "step": 2152, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:54.884029", "step": 2152, "epoch": 3 }, { "type": "loss", "content": 0.0026290721725672483, "timestamp": "2025-09-30 22:11:54.886810", "step": 2153, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:54.917613", "step": 2153, "epoch": 3 }, { "type": "loss", "content": 0.0038917474448680878, "timestamp": "2025-09-30 22:11:54.919671", "step": 2154, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:54.950099", "step": 2154, "epoch": 3 }, { "type": "loss", "content": 0.003153451019898057, "timestamp": "2025-09-30 22:11:54.952576", "step": 2155, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:54.985718", "step": 2155, "epoch": 3 }, { "type": "loss", "content": 0.0026568372268229723, "timestamp": "2025-09-30 22:11:55.009286", "step": 2156, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:55.042409", "step": 2156, "epoch": 3 }, { "type": "loss", "content": 0.003504603635519743, "timestamp": "2025-09-30 22:11:55.046968", "step": 2157, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:55.077820", "step": 2157, "epoch": 3 }, { "type": "loss", "content": 0.0010103067616000772, "timestamp": "2025-09-30 22:11:55.084037", "step": 2158, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:55.121773", "step": 2158, "epoch": 3 }, { "type": "loss", "content": 0.008074522949755192, "timestamp": "2025-09-30 22:11:55.124579", "step": 2159, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:55.167900", "step": 2159, "epoch": 3 }, { "type": "loss", "content": 0.002990599488839507, "timestamp": "2025-09-30 22:11:55.191805", "step": 2160, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:55.224558", "step": 2160, "epoch": 3 }, { "type": "loss", "content": 0.014099623076617718, "timestamp": "2025-09-30 22:11:55.226571", "step": 2161, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:11:55.257948", "step": 2161, "epoch": 3 }, { "type": "loss", "content": 0.0026668712962418795, "timestamp": "2025-09-30 22:11:55.260610", "step": 2162, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:55.291956", "step": 2162, "epoch": 3 }, { "type": "loss", "content": 0.0051649161614477634, "timestamp": "2025-09-30 22:11:55.295302", "step": 2163, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:55.330018", "step": 2163, "epoch": 3 }, { "type": "loss", "content": 0.003950497601181269, "timestamp": "2025-09-30 22:11:55.353385", "step": 2164, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:55.386345", "step": 2164, "epoch": 3 }, { "type": "loss", "content": 0.0007944428361952305, "timestamp": "2025-09-30 22:11:55.389019", "step": 2165, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:55.420687", "step": 2165, "epoch": 3 }, { "type": "loss", "content": 0.0038365547079592943, "timestamp": "2025-09-30 22:11:55.423338", "step": 2166, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:11:56.276475", "step": 2166, "epoch": 3 }, { "type": "pplx", "content": 41345378.60273332, "timestamp": "2025-09-30 22:11:56.278302", "step": 2166, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:56.306541", "step": 2166, "epoch": 3 }, { "type": "loss", "content": 0.031493984162807465, "timestamp": "2025-09-30 22:11:56.308282", "step": 2167, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:56.338618", "step": 2167, "epoch": 3 }, { "type": "loss", "content": 0.006299360655248165, "timestamp": "2025-09-30 22:11:56.362412", "step": 2168, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:56.397194", "step": 2168, "epoch": 3 }, { "type": "loss", "content": 0.0030568100046366453, "timestamp": "2025-09-30 22:11:56.399331", "step": 2169, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:11:56.433632", "step": 2169, "epoch": 3 }, { "type": "loss", "content": 0.00035096920328214765, "timestamp": "2025-09-30 22:11:56.436242", "step": 2170, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:56.468904", "step": 2170, "epoch": 3 }, { "type": "loss", "content": 0.001514105941168964, "timestamp": "2025-09-30 22:11:56.470954", "step": 2171, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:56.502330", "step": 2171, "epoch": 3 }, { "type": "loss", "content": 0.000387070031138137, "timestamp": "2025-09-30 22:11:56.526142", "step": 2172, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:56.556135", "step": 2172, "epoch": 3 }, { "type": "loss", "content": 0.0033165724016726017, "timestamp": "2025-09-30 22:11:56.558296", "step": 2173, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:56.588419", "step": 2173, "epoch": 3 }, { "type": "loss", "content": 0.005177160259336233, "timestamp": "2025-09-30 22:11:56.591005", "step": 2174, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:56.621126", "step": 2174, "epoch": 3 }, { "type": "loss", "content": 0.002708001295104623, "timestamp": "2025-09-30 22:11:56.623111", "step": 2175, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:56.654412", "step": 2175, "epoch": 3 }, { "type": "loss", "content": 0.0013978255447000265, "timestamp": "2025-09-30 22:11:56.678894", "step": 2176, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:56.709023", "step": 2176, "epoch": 3 }, { "type": "loss", "content": 0.0025910213589668274, "timestamp": "2025-09-30 22:11:56.710820", "step": 2177, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:56.743305", "step": 2177, "epoch": 3 }, { "type": "loss", "content": 0.06920283287763596, "timestamp": "2025-09-30 22:11:56.745056", "step": 2178, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:56.775267", "step": 2178, "epoch": 3 }, { "type": "loss", "content": 0.0009130876278504729, "timestamp": "2025-09-30 22:11:56.777823", "step": 2179, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:56.809419", "step": 2179, "epoch": 3 }, { "type": "loss", "content": 0.0014621271984651685, "timestamp": "2025-09-30 22:11:56.833406", "step": 2180, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:56.863839", "step": 2180, "epoch": 3 }, { "type": "loss", "content": 0.0005752498982474208, "timestamp": "2025-09-30 22:11:56.865798", "step": 2181, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:56.895858", "step": 2181, "epoch": 3 }, { "type": "loss", "content": 0.0007989993318915367, "timestamp": "2025-09-30 22:11:56.898158", "step": 2182, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:56.928574", "step": 2182, "epoch": 3 }, { "type": "loss", "content": 0.0009071322274394333, "timestamp": "2025-09-30 22:11:56.930839", "step": 2183, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:56.962076", "step": 2183, "epoch": 3 }, { "type": "loss", "content": 0.0018987265648320317, "timestamp": "2025-09-30 22:11:56.985837", "step": 2184, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:57.015617", "step": 2184, "epoch": 3 }, { "type": "loss", "content": 0.029847219586372375, "timestamp": "2025-09-30 22:11:57.025141", "step": 2185, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:57.055917", "step": 2185, "epoch": 3 }, { "type": "loss", "content": 0.001003224402666092, "timestamp": "2025-09-30 22:11:57.058942", "step": 2186, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:57.090904", "step": 2186, "epoch": 3 }, { "type": "loss", "content": 0.023648729547858238, "timestamp": "2025-09-30 22:11:57.094613", "step": 2187, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:57.126710", "step": 2187, "epoch": 3 }, { "type": "loss", "content": 0.003437680657953024, "timestamp": "2025-09-30 22:11:57.149958", "step": 2188, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:57.179942", "step": 2188, "epoch": 3 }, { "type": "loss", "content": 0.003817408112809062, "timestamp": "2025-09-30 22:11:57.182016", "step": 2189, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:57.212247", "step": 2189, "epoch": 3 }, { "type": "loss", "content": 0.0008727815002202988, "timestamp": "2025-09-30 22:11:57.215645", "step": 2190, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:57.252181", "step": 2190, "epoch": 3 }, { "type": "loss", "content": 0.01654052920639515, "timestamp": "2025-09-30 22:11:57.255333", "step": 2191, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:57.287622", "step": 2191, "epoch": 3 }, { "type": "loss", "content": 0.00038399777258746326, "timestamp": "2025-09-30 22:11:57.311839", "step": 2192, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:57.346721", "step": 2192, "epoch": 3 }, { "type": "loss", "content": 0.00027529371436685324, "timestamp": "2025-09-30 22:11:57.348871", "step": 2193, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:57.386221", "step": 2193, "epoch": 3 }, { "type": "loss", "content": 0.024949418380856514, "timestamp": "2025-09-30 22:11:57.392977", "step": 2194, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:57.432676", "step": 2194, "epoch": 3 }, { "type": "loss", "content": 0.0010464598890393972, "timestamp": "2025-09-30 22:11:57.435229", "step": 2195, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:57.482869", "step": 2195, "epoch": 3 }, { "type": "loss", "content": 0.040087323635816574, "timestamp": "2025-09-30 22:11:57.509506", "step": 2196, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:57.544617", "step": 2196, "epoch": 3 }, { "type": "loss", "content": 0.01321853045374155, "timestamp": "2025-09-30 22:11:57.548530", "step": 2197, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:57.581886", "step": 2197, "epoch": 3 }, { "type": "loss", "content": 0.00213692057877779, "timestamp": "2025-09-30 22:11:57.585034", "step": 2198, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:57.618036", "step": 2198, "epoch": 3 }, { "type": "loss", "content": 0.0006283451803028584, "timestamp": "2025-09-30 22:11:57.620431", "step": 2199, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:57.653624", "step": 2199, "epoch": 3 }, { "type": "loss", "content": 0.0003651838924270123, "timestamp": "2025-09-30 22:11:57.678718", "step": 2200, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:11:57.718869", "step": 2200, "epoch": 3 }, { "type": "loss", "content": 0.0010239933617413044, "timestamp": "2025-09-30 22:11:57.720993", "step": 2201, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:57.754953", "step": 2201, "epoch": 3 }, { "type": "loss", "content": 0.028298767283558846, "timestamp": "2025-09-30 22:11:57.757035", "step": 2202, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:57.791476", "step": 2202, "epoch": 3 }, { "type": "loss", "content": 0.0006448552594520152, "timestamp": "2025-09-30 22:11:57.795278", "step": 2203, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:57.831152", "step": 2203, "epoch": 3 }, { "type": "loss", "content": 0.0008334387093782425, "timestamp": "2025-09-30 22:11:57.860596", "step": 2204, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:57.895575", "step": 2204, "epoch": 3 }, { "type": "loss", "content": 0.0005287216627039015, "timestamp": "2025-09-30 22:11:57.898566", "step": 2205, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:57.935665", "step": 2205, "epoch": 3 }, { "type": "loss", "content": 0.0017126341117545962, "timestamp": "2025-09-30 22:11:57.939812", "step": 2206, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:57.974133", "step": 2206, "epoch": 3 }, { "type": "loss", "content": 0.0010332076344639063, "timestamp": "2025-09-30 22:11:57.976496", "step": 2207, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:58.022134", "step": 2207, "epoch": 3 }, { "type": "loss", "content": 0.01596010848879814, "timestamp": "2025-09-30 22:11:58.046395", "step": 2208, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:58.087991", "step": 2208, "epoch": 3 }, { "type": "loss", "content": 0.0012377096572890878, "timestamp": "2025-09-30 22:11:58.094121", "step": 2209, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:58.127307", "step": 2209, "epoch": 3 }, { "type": "loss", "content": 0.0016445504734292626, "timestamp": "2025-09-30 22:11:58.133321", "step": 2210, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:58.169585", "step": 2210, "epoch": 3 }, { "type": "loss", "content": 0.00024720263900235295, "timestamp": "2025-09-30 22:11:58.171920", "step": 2211, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:58.208594", "step": 2211, "epoch": 3 }, { "type": "loss", "content": 0.0014899687375873327, "timestamp": "2025-09-30 22:11:58.233825", "step": 2212, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:58.265323", "step": 2212, "epoch": 3 }, { "type": "loss", "content": 0.01058391947299242, "timestamp": "2025-09-30 22:11:58.267681", "step": 2213, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:58.301220", "step": 2213, "epoch": 3 }, { "type": "loss", "content": 0.0021572033874690533, "timestamp": "2025-09-30 22:11:58.304807", "step": 2214, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:58.341173", "step": 2214, "epoch": 3 }, { "type": "loss", "content": 0.0013435721630230546, "timestamp": "2025-09-30 22:11:58.344084", "step": 2215, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:11:58.379137", "step": 2215, "epoch": 3 }, { "type": "loss", "content": 0.014729948714375496, "timestamp": "2025-09-30 22:11:58.403177", "step": 2216, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:58.435477", "step": 2216, "epoch": 3 }, { "type": "loss", "content": 0.006870845798403025, "timestamp": "2025-09-30 22:11:58.442814", "step": 2217, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:58.478839", "step": 2217, "epoch": 3 }, { "type": "loss", "content": 0.025339728221297264, "timestamp": "2025-09-30 22:11:58.482003", "step": 2218, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:58.520112", "step": 2218, "epoch": 3 }, { "type": "loss", "content": 0.0009052485111169517, "timestamp": "2025-09-30 22:11:58.522814", "step": 2219, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:58.554393", "step": 2219, "epoch": 3 }, { "type": "loss", "content": 0.0033087660558521748, "timestamp": "2025-09-30 22:11:58.580417", "step": 2220, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:58.621971", "step": 2220, "epoch": 3 }, { "type": "loss", "content": 0.0004655886150430888, "timestamp": "2025-09-30 22:11:58.624104", "step": 2221, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:58.659507", "step": 2221, "epoch": 3 }, { "type": "loss", "content": 0.001055079628713429, "timestamp": "2025-09-30 22:11:58.662247", "step": 2222, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:11:58.701919", "step": 2222, "epoch": 3 }, { "type": "loss", "content": 0.0007419497705996037, "timestamp": "2025-09-30 22:11:58.706965", "step": 2223, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:11:59.612031", "step": 2223, "epoch": 3 }, { "type": "pplx", "content": 41080650.73787688, "timestamp": "2025-09-30 22:11:59.614983", "step": 2223, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:59.651495", "step": 2223, "epoch": 3 }, { "type": "loss", "content": 0.0021477299742400646, "timestamp": "2025-09-30 22:11:59.676629", "step": 2224, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:59.709738", "step": 2224, "epoch": 3 }, { "type": "loss", "content": 0.002651253715157509, "timestamp": "2025-09-30 22:11:59.724011", "step": 2225, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:59.759451", "step": 2225, "epoch": 3 }, { "type": "loss", "content": 0.001111521851271391, "timestamp": "2025-09-30 22:11:59.767172", "step": 2226, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:59.802065", "step": 2226, "epoch": 3 }, { "type": "loss", "content": 0.011532140895724297, "timestamp": "2025-09-30 22:11:59.805099", "step": 2227, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:11:59.852920", "step": 2227, "epoch": 3 }, { "type": "loss", "content": 0.012329302728176117, "timestamp": "2025-09-30 22:11:59.880360", "step": 2228, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:59.917214", "step": 2228, "epoch": 3 }, { "type": "loss", "content": 0.00019277750106994063, "timestamp": "2025-09-30 22:11:59.920528", "step": 2229, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:59.953450", "step": 2229, "epoch": 3 }, { "type": "loss", "content": 0.0016648133751004934, "timestamp": "2025-09-30 22:11:59.956007", "step": 2230, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:11:59.987874", "step": 2230, "epoch": 3 }, { "type": "loss", "content": 0.00010171485337195918, "timestamp": "2025-09-30 22:11:59.990671", "step": 2231, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:00.023025", "step": 2231, "epoch": 3 }, { "type": "loss", "content": 0.003272840054705739, "timestamp": "2025-09-30 22:12:00.051308", "step": 2232, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:00.098015", "step": 2232, "epoch": 3 }, { "type": "loss", "content": 0.0023247734643518925, "timestamp": "2025-09-30 22:12:00.100228", "step": 2233, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:00.144520", "step": 2233, "epoch": 3 }, { "type": "loss", "content": 0.0013171957107260823, "timestamp": "2025-09-30 22:12:00.147086", "step": 2234, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:00.190259", "step": 2234, "epoch": 3 }, { "type": "loss", "content": 0.0005580909783020616, "timestamp": "2025-09-30 22:12:00.194040", "step": 2235, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:00.236684", "step": 2235, "epoch": 3 }, { "type": "loss", "content": 0.0006598988547921181, "timestamp": "2025-09-30 22:12:00.267458", "step": 2236, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:00.305777", "step": 2236, "epoch": 3 }, { "type": "loss", "content": 0.002186252735555172, "timestamp": "2025-09-30 22:12:00.312410", "step": 2237, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:00.357546", "step": 2237, "epoch": 3 }, { "type": "loss", "content": 0.006100684404373169, "timestamp": "2025-09-30 22:12:00.360014", "step": 2238, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:00.400839", "step": 2238, "epoch": 3 }, { "type": "loss", "content": 0.03646007925271988, "timestamp": "2025-09-30 22:12:00.404742", "step": 2239, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:00.437361", "step": 2239, "epoch": 3 }, { "type": "loss", "content": 0.003415354760363698, "timestamp": "2025-09-30 22:12:00.461510", "step": 2240, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:00.494876", "step": 2240, "epoch": 3 }, { "type": "loss", "content": 0.0009296032367274165, "timestamp": "2025-09-30 22:12:00.498079", "step": 2241, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:00.544104", "step": 2241, "epoch": 3 }, { "type": "loss", "content": 0.00020019887597300112, "timestamp": "2025-09-30 22:12:00.553285", "step": 2242, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:00.584744", "step": 2242, "epoch": 3 }, { "type": "loss", "content": 0.000863843597471714, "timestamp": "2025-09-30 22:12:00.588385", "step": 2243, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:00.623776", "step": 2243, "epoch": 3 }, { "type": "loss", "content": 0.0016993492608889937, "timestamp": "2025-09-30 22:12:00.653484", "step": 2244, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:00.702084", "step": 2244, "epoch": 3 }, { "type": "loss", "content": 0.0019676017109304667, "timestamp": "2025-09-30 22:12:00.705865", "step": 2245, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:00.742784", "step": 2245, "epoch": 3 }, { "type": "loss", "content": 0.00311626517213881, "timestamp": "2025-09-30 22:12:00.747544", "step": 2246, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:00.784398", "step": 2246, "epoch": 3 }, { "type": "loss", "content": 0.002856964012607932, "timestamp": "2025-09-30 22:12:00.791186", "step": 2247, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:00.823880", "step": 2247, "epoch": 3 }, { "type": "loss", "content": 0.004770600702613592, "timestamp": "2025-09-30 22:12:00.848076", "step": 2248, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:00.880530", "step": 2248, "epoch": 3 }, { "type": "loss", "content": 0.00405394472181797, "timestamp": "2025-09-30 22:12:00.886406", "step": 2249, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:00.917905", "step": 2249, "epoch": 3 }, { "type": "loss", "content": 0.00012620205234270543, "timestamp": "2025-09-30 22:12:00.920939", "step": 2250, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:00.956680", "step": 2250, "epoch": 3 }, { "type": "loss", "content": 0.0037796092219650745, "timestamp": "2025-09-30 22:12:00.959116", "step": 2251, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:00.998630", "step": 2251, "epoch": 3 }, { "type": "loss", "content": 0.0041126045398414135, "timestamp": "2025-09-30 22:12:01.023514", "step": 2252, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:01.063613", "step": 2252, "epoch": 3 }, { "type": "loss", "content": 0.0031900478061288595, "timestamp": "2025-09-30 22:12:01.066754", "step": 2253, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:01.102425", "step": 2253, "epoch": 3 }, { "type": "loss", "content": 0.0007147821015678346, "timestamp": "2025-09-30 22:12:01.108462", "step": 2254, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:01.156598", "step": 2254, "epoch": 3 }, { "type": "loss", "content": 0.0002387921849731356, "timestamp": "2025-09-30 22:12:01.160749", "step": 2255, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:01.206689", "step": 2255, "epoch": 3 }, { "type": "loss", "content": 8.921477274270728e-05, "timestamp": "2025-09-30 22:12:01.231930", "step": 2256, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:01.289999", "step": 2256, "epoch": 3 }, { "type": "loss", "content": 0.00011117455142084509, "timestamp": "2025-09-30 22:12:01.293131", "step": 2257, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:01.327003", "step": 2257, "epoch": 3 }, { "type": "loss", "content": 0.002855940954759717, "timestamp": "2025-09-30 22:12:01.329937", "step": 2258, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:01.364426", "step": 2258, "epoch": 3 }, { "type": "loss", "content": 0.0015121333999559283, "timestamp": "2025-09-30 22:12:01.368663", "step": 2259, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:01.400739", "step": 2259, "epoch": 3 }, { "type": "loss", "content": 0.004547151271253824, "timestamp": "2025-09-30 22:12:01.425741", "step": 2260, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:01.472112", "step": 2260, "epoch": 3 }, { "type": "loss", "content": 0.0014173558447510004, "timestamp": "2025-09-30 22:12:01.475511", "step": 2261, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:01.509433", "step": 2261, "epoch": 3 }, { "type": "loss", "content": 0.0007967141573317349, "timestamp": "2025-09-30 22:12:01.513037", "step": 2262, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:01.551003", "step": 2262, "epoch": 3 }, { "type": "loss", "content": 0.004071411211043596, "timestamp": "2025-09-30 22:12:01.554383", "step": 2263, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:01.601596", "step": 2263, "epoch": 3 }, { "type": "loss", "content": 0.001577091054059565, "timestamp": "2025-09-30 22:12:01.630556", "step": 2264, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:01.685148", "step": 2264, "epoch": 3 }, { "type": "loss", "content": 7.375147106358781e-05, "timestamp": "2025-09-30 22:12:01.687706", "step": 2265, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:01.728395", "step": 2265, "epoch": 3 }, { "type": "loss", "content": 0.0009554016287438571, "timestamp": "2025-09-30 22:12:01.736485", "step": 2266, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:01.770317", "step": 2266, "epoch": 3 }, { "type": "loss", "content": 0.003659574780613184, "timestamp": "2025-09-30 22:12:01.773461", "step": 2267, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:12:01.806720", "step": 2267, "epoch": 3 }, { "type": "loss", "content": 0.005300324410200119, "timestamp": "2025-09-30 22:12:01.830722", "step": 2268, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:01.872522", "step": 2268, "epoch": 3 }, { "type": "loss", "content": 0.0002838976215571165, "timestamp": "2025-09-30 22:12:01.876519", "step": 2269, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:01.908720", "step": 2269, "epoch": 3 }, { "type": "loss", "content": 0.00027007676544599235, "timestamp": "2025-09-30 22:12:01.911009", "step": 2270, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:01.957192", "step": 2270, "epoch": 3 }, { "type": "loss", "content": 0.0017464784905314445, "timestamp": "2025-09-30 22:12:01.963999", "step": 2271, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:02.001305", "step": 2271, "epoch": 3 }, { "type": "loss", "content": 0.0007175366627052426, "timestamp": "2025-09-30 22:12:02.029407", "step": 2272, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:02.092776", "step": 2272, "epoch": 3 }, { "type": "loss", "content": 0.004835275001823902, "timestamp": "2025-09-30 22:12:02.095027", "step": 2273, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:02.130028", "step": 2273, "epoch": 3 }, { "type": "loss", "content": 0.0029070251621305943, "timestamp": "2025-09-30 22:12:02.137218", "step": 2274, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:02.169591", "step": 2274, "epoch": 3 }, { "type": "loss", "content": 0.012067276053130627, "timestamp": "2025-09-30 22:12:02.175245", "step": 2275, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:02.220400", "step": 2275, "epoch": 3 }, { "type": "loss", "content": 0.009939714334905148, "timestamp": "2025-09-30 22:12:02.244711", "step": 2276, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:02.290047", "step": 2276, "epoch": 3 }, { "type": "loss", "content": 0.0007312466041184962, "timestamp": "2025-09-30 22:12:02.292294", "step": 2277, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:02.331166", "step": 2277, "epoch": 3 }, { "type": "loss", "content": 0.0020863066893070936, "timestamp": "2025-09-30 22:12:02.335425", "step": 2278, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:02.370381", "step": 2278, "epoch": 3 }, { "type": "loss", "content": 0.0002355527103645727, "timestamp": "2025-09-30 22:12:02.373473", "step": 2279, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:02.405483", "step": 2279, "epoch": 3 }, { "type": "loss", "content": 0.007479370106011629, "timestamp": "2025-09-30 22:12:02.430228", "step": 2280, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:12:03.399586", "step": 2280, "epoch": 3 }, { "type": "pplx", "content": 46849847.313949354, "timestamp": "2025-09-30 22:12:03.407812", "step": 2280, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:03.436963", "step": 2280, "epoch": 3 }, { "type": "loss", "content": 0.001072447281330824, "timestamp": "2025-09-30 22:12:03.440189", "step": 2281, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:03.473962", "step": 2281, "epoch": 3 }, { "type": "loss", "content": 0.0004119630320928991, "timestamp": "2025-09-30 22:12:03.482672", "step": 2282, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:03.519456", "step": 2282, "epoch": 3 }, { "type": "loss", "content": 0.00015061446174513549, "timestamp": "2025-09-30 22:12:03.523228", "step": 2283, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:03.556675", "step": 2283, "epoch": 3 }, { "type": "loss", "content": 0.000557638006284833, "timestamp": "2025-09-30 22:12:03.585615", "step": 2284, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:03.631870", "step": 2284, "epoch": 3 }, { "type": "loss", "content": 0.0005773733137175441, "timestamp": "2025-09-30 22:12:03.634814", "step": 2285, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:03.675837", "step": 2285, "epoch": 3 }, { "type": "loss", "content": 0.00030607712687924504, "timestamp": "2025-09-30 22:12:03.683777", "step": 2286, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:03.717388", "step": 2286, "epoch": 3 }, { "type": "loss", "content": 0.0001611269690329209, "timestamp": "2025-09-30 22:12:03.719972", "step": 2287, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:12:03.756677", "step": 2287, "epoch": 3 }, { "type": "loss", "content": 0.0007993626059032977, "timestamp": "2025-09-30 22:12:03.781334", "step": 2288, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:03.821981", "step": 2288, "epoch": 3 }, { "type": "loss", "content": 9.523901098873466e-05, "timestamp": "2025-09-30 22:12:03.825103", "step": 2289, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:03.858730", "step": 2289, "epoch": 3 }, { "type": "loss", "content": 0.000503116229083389, "timestamp": "2025-09-30 22:12:03.862442", "step": 2290, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:03.897690", "step": 2290, "epoch": 3 }, { "type": "loss", "content": 0.0010981824016198516, "timestamp": "2025-09-30 22:12:03.901476", "step": 2291, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:03.933846", "step": 2291, "epoch": 3 }, { "type": "loss", "content": 0.003907725214958191, "timestamp": "2025-09-30 22:12:03.958803", "step": 2292, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:03.997275", "step": 2292, "epoch": 3 }, { "type": "loss", "content": 0.0007813084521330893, "timestamp": "2025-09-30 22:12:04.006086", "step": 2293, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:04.044795", "step": 2293, "epoch": 3 }, { "type": "loss", "content": 0.0015995989087969065, "timestamp": "2025-09-30 22:12:04.052998", "step": 2294, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:04.106455", "step": 2294, "epoch": 3 }, { "type": "loss", "content": 0.008173911832273006, "timestamp": "2025-09-30 22:12:04.112277", "step": 2295, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:04.157702", "step": 2295, "epoch": 3 }, { "type": "loss", "content": 0.006400450598448515, "timestamp": "2025-09-30 22:12:04.182227", "step": 2296, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:04.218839", "step": 2296, "epoch": 3 }, { "type": "loss", "content": 0.001864130492322147, "timestamp": "2025-09-30 22:12:04.222678", "step": 2297, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:04.256426", "step": 2297, "epoch": 3 }, { "type": "loss", "content": 0.002440927317366004, "timestamp": "2025-09-30 22:12:04.259903", "step": 2298, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:04.299811", "step": 2298, "epoch": 3 }, { "type": "loss", "content": 0.0001420110056642443, "timestamp": "2025-09-30 22:12:04.303080", "step": 2299, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:04.349103", "step": 2299, "epoch": 3 }, { "type": "loss", "content": 0.00029452916351146996, "timestamp": "2025-09-30 22:12:04.384052", "step": 2300, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:04.415818", "step": 2300, "epoch": 3 }, { "type": "loss", "content": 0.00028071500128135085, "timestamp": "2025-09-30 22:12:04.418464", "step": 2301, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:04.467075", "step": 2301, "epoch": 3 }, { "type": "loss", "content": 0.00010159516386920586, "timestamp": "2025-09-30 22:12:04.470372", "step": 2302, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:04.508587", "step": 2302, "epoch": 3 }, { "type": "loss", "content": 0.024396846070885658, "timestamp": "2025-09-30 22:12:04.516211", "step": 2303, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:04.556466", "step": 2303, "epoch": 3 }, { "type": "loss", "content": 0.03193879500031471, "timestamp": "2025-09-30 22:12:04.580809", "step": 2304, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:04.627001", "step": 2304, "epoch": 3 }, { "type": "loss", "content": 0.00011379749776097015, "timestamp": "2025-09-30 22:12:04.635055", "step": 2305, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:04.672852", "step": 2305, "epoch": 3 }, { "type": "loss", "content": 0.0007863629725761712, "timestamp": "2025-09-30 22:12:04.680034", "step": 2306, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:04.714575", "step": 2306, "epoch": 3 }, { "type": "loss", "content": 0.019120637327432632, "timestamp": "2025-09-30 22:12:04.717585", "step": 2307, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:04.750404", "step": 2307, "epoch": 3 }, { "type": "loss", "content": 0.013492346741259098, "timestamp": "2025-09-30 22:12:04.778357", "step": 2308, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:04.810855", "step": 2308, "epoch": 3 }, { "type": "loss", "content": 0.0003213980817236006, "timestamp": "2025-09-30 22:12:04.818468", "step": 2309, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:04.867683", "step": 2309, "epoch": 3 }, { "type": "loss", "content": 0.00035105025744996965, "timestamp": "2025-09-30 22:12:04.874844", "step": 2310, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:04.928073", "step": 2310, "epoch": 3 }, { "type": "loss", "content": 0.00015687046106904745, "timestamp": "2025-09-30 22:12:04.931276", "step": 2311, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:04.974813", "step": 2311, "epoch": 3 }, { "type": "loss", "content": 0.0003475369594525546, "timestamp": "2025-09-30 22:12:05.003685", "step": 2312, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:05.046387", "step": 2312, "epoch": 3 }, { "type": "loss", "content": 0.0002608960203360766, "timestamp": "2025-09-30 22:12:05.059813", "step": 2313, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:05.098699", "step": 2313, "epoch": 3 }, { "type": "loss", "content": 0.00015405855083372444, "timestamp": "2025-09-30 22:12:05.105923", "step": 2314, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:05.150191", "step": 2314, "epoch": 3 }, { "type": "loss", "content": 0.007762204390019178, "timestamp": "2025-09-30 22:12:05.153871", "step": 2315, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:05.187810", "step": 2315, "epoch": 3 }, { "type": "loss", "content": 0.000645505147986114, "timestamp": "2025-09-30 22:12:05.212043", "step": 2316, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:05.254396", "step": 2316, "epoch": 3 }, { "type": "loss", "content": 0.01529739797115326, "timestamp": "2025-09-30 22:12:05.258026", "step": 2317, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:05.298298", "step": 2317, "epoch": 3 }, { "type": "loss", "content": 0.0003246499109081924, "timestamp": "2025-09-30 22:12:05.308497", "step": 2318, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:05.343271", "step": 2318, "epoch": 3 }, { "type": "loss", "content": 0.002341361017897725, "timestamp": "2025-09-30 22:12:05.346182", "step": 2319, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:05.400375", "step": 2319, "epoch": 3 }, { "type": "loss", "content": 0.01718306355178356, "timestamp": "2025-09-30 22:12:05.434471", "step": 2320, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:05.481672", "step": 2320, "epoch": 3 }, { "type": "loss", "content": 0.0016600488452240825, "timestamp": "2025-09-30 22:12:05.485877", "step": 2321, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:05.518932", "step": 2321, "epoch": 3 }, { "type": "loss", "content": 0.0003560645563993603, "timestamp": "2025-09-30 22:12:05.522143", "step": 2322, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:05.569894", "step": 2322, "epoch": 3 }, { "type": "loss", "content": 0.0010048066033050418, "timestamp": "2025-09-30 22:12:05.573432", "step": 2323, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:05.624869", "step": 2323, "epoch": 3 }, { "type": "loss", "content": 0.010342958383262157, "timestamp": "2025-09-30 22:12:05.661338", "step": 2324, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:05.703923", "step": 2324, "epoch": 3 }, { "type": "loss", "content": 0.0004138563817832619, "timestamp": "2025-09-30 22:12:05.707259", "step": 2325, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:05.747256", "step": 2325, "epoch": 3 }, { "type": "loss", "content": 0.009076876565814018, "timestamp": "2025-09-30 22:12:05.758697", "step": 2326, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:05.791111", "step": 2326, "epoch": 3 }, { "type": "loss", "content": 0.004802730865776539, "timestamp": "2025-09-30 22:12:05.794003", "step": 2327, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:05.835394", "step": 2327, "epoch": 3 }, { "type": "loss", "content": 0.0006566385854966938, "timestamp": "2025-09-30 22:12:05.859710", "step": 2328, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:05.897532", "step": 2328, "epoch": 3 }, { "type": "loss", "content": 0.00047095856280066073, "timestamp": "2025-09-30 22:12:05.900682", "step": 2329, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:05.944301", "step": 2329, "epoch": 3 }, { "type": "loss", "content": 0.00011278774763923138, "timestamp": "2025-09-30 22:12:05.951644", "step": 2330, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:05.985479", "step": 2330, "epoch": 3 }, { "type": "loss", "content": 0.0002812811580952257, "timestamp": "2025-09-30 22:12:05.993798", "step": 2331, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:06.040965", "step": 2331, "epoch": 3 }, { "type": "loss", "content": 0.001078323693946004, "timestamp": "2025-09-30 22:12:06.065658", "step": 2332, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:06.104599", "step": 2332, "epoch": 3 }, { "type": "loss", "content": 0.014645090326666832, "timestamp": "2025-09-30 22:12:06.108302", "step": 2333, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:06.146369", "step": 2333, "epoch": 3 }, { "type": "loss", "content": 0.0006407131440937519, "timestamp": "2025-09-30 22:12:06.153793", "step": 2334, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:06.196517", "step": 2334, "epoch": 3 }, { "type": "loss", "content": 0.005221139173954725, "timestamp": "2025-09-30 22:12:06.205904", "step": 2335, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:06.245185", "step": 2335, "epoch": 3 }, { "type": "loss", "content": 0.013521954417228699, "timestamp": "2025-09-30 22:12:06.281742", "step": 2336, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:06.313675", "step": 2336, "epoch": 3 }, { "type": "loss", "content": 0.005685387644916773, "timestamp": "2025-09-30 22:12:06.316262", "step": 2337, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:12:07.330929", "step": 2337, "epoch": 3 }, { "type": "pplx", "content": 46391372.17725871, "timestamp": "2025-09-30 22:12:07.336034", "step": 2337, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:07.373369", "step": 2337, "epoch": 3 }, { "type": "loss", "content": 0.0001280421856790781, "timestamp": "2025-09-30 22:12:07.375905", "step": 2338, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:07.413942", "step": 2338, "epoch": 3 }, { "type": "loss", "content": 0.0010907641844823956, "timestamp": "2025-09-30 22:12:07.421205", "step": 2339, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:07.460438", "step": 2339, "epoch": 3 }, { "type": "loss", "content": 0.001054750056937337, "timestamp": "2025-09-30 22:12:07.489134", "step": 2340, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:07.529870", "step": 2340, "epoch": 3 }, { "type": "loss", "content": 0.0010710905771702528, "timestamp": "2025-09-30 22:12:07.536172", "step": 2341, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:07.586081", "step": 2341, "epoch": 3 }, { "type": "loss", "content": 0.0010237701935693622, "timestamp": "2025-09-30 22:12:07.589822", "step": 2342, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:07.627960", "step": 2342, "epoch": 3 }, { "type": "loss", "content": 0.008467769250273705, "timestamp": "2025-09-30 22:12:07.631790", "step": 2343, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:07.667145", "step": 2343, "epoch": 3 }, { "type": "loss", "content": 0.002808736404404044, "timestamp": "2025-09-30 22:12:07.691732", "step": 2344, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:07.725161", "step": 2344, "epoch": 3 }, { "type": "loss", "content": 0.029646631330251694, "timestamp": "2025-09-30 22:12:07.729297", "step": 2345, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:07.761797", "step": 2345, "epoch": 3 }, { "type": "loss", "content": 0.007430060766637325, "timestamp": "2025-09-30 22:12:07.765228", "step": 2346, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:07.799199", "step": 2346, "epoch": 3 }, { "type": "loss", "content": 0.0001956993219209835, "timestamp": "2025-09-30 22:12:07.803655", "step": 2347, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:07.836540", "step": 2347, "epoch": 3 }, { "type": "loss", "content": 0.0011663747718557715, "timestamp": "2025-09-30 22:12:07.861082", "step": 2348, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:07.893631", "step": 2348, "epoch": 3 }, { "type": "loss", "content": 0.0004969866713508964, "timestamp": "2025-09-30 22:12:07.896893", "step": 2349, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:07.943325", "step": 2349, "epoch": 3 }, { "type": "loss", "content": 0.0021070470102131367, "timestamp": "2025-09-30 22:12:07.945967", "step": 2350, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:07.989762", "step": 2350, "epoch": 3 }, { "type": "loss", "content": 0.0004443576035555452, "timestamp": "2025-09-30 22:12:07.992864", "step": 2351, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:08.032050", "step": 2351, "epoch": 3 }, { "type": "loss", "content": 0.0007625381113030016, "timestamp": "2025-09-30 22:12:08.056555", "step": 2352, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:08.112245", "step": 2352, "epoch": 3 }, { "type": "loss", "content": 9.592527931090444e-05, "timestamp": "2025-09-30 22:12:08.119195", "step": 2353, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:08.159404", "step": 2353, "epoch": 3 }, { "type": "loss", "content": 0.004538069479167461, "timestamp": "2025-09-30 22:12:08.162209", "step": 2354, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:08.204106", "step": 2354, "epoch": 3 }, { "type": "loss", "content": 0.0006490609957836568, "timestamp": "2025-09-30 22:12:08.208849", "step": 2355, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:08.254052", "step": 2355, "epoch": 3 }, { "type": "loss", "content": 0.002888438990339637, "timestamp": "2025-09-30 22:12:08.285560", "step": 2356, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:08.330447", "step": 2356, "epoch": 3 }, { "type": "loss", "content": 0.00023478925868403167, "timestamp": "2025-09-30 22:12:08.342245", "step": 2357, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:08.376392", "step": 2357, "epoch": 3 }, { "type": "loss", "content": 0.00019593130855355412, "timestamp": "2025-09-30 22:12:08.383505", "step": 2358, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:08.416777", "step": 2358, "epoch": 3 }, { "type": "loss", "content": 0.00017743787611834705, "timestamp": "2025-09-30 22:12:08.421424", "step": 2359, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:08.456841", "step": 2359, "epoch": 3 }, { "type": "loss", "content": 0.002296753926202655, "timestamp": "2025-09-30 22:12:08.492356", "step": 2360, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:08.527590", "step": 2360, "epoch": 3 }, { "type": "loss", "content": 6.212801235960796e-05, "timestamp": "2025-09-30 22:12:08.532029", "step": 2361, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:08.571372", "step": 2361, "epoch": 3 }, { "type": "loss", "content": 0.002575967228040099, "timestamp": "2025-09-30 22:12:08.574168", "step": 2362, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:08.607047", "step": 2362, "epoch": 3 }, { "type": "loss", "content": 0.00018172072304878384, "timestamp": "2025-09-30 22:12:08.618408", "step": 2363, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:08.654014", "step": 2363, "epoch": 3 }, { "type": "loss", "content": 0.0006860285648144782, "timestamp": "2025-09-30 22:12:08.688882", "step": 2364, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:08.732977", "step": 2364, "epoch": 3 }, { "type": "loss", "content": 0.02065959945321083, "timestamp": "2025-09-30 22:12:08.736267", "step": 2365, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:08.778800", "step": 2365, "epoch": 3 }, { "type": "loss", "content": 0.03529736027121544, "timestamp": "2025-09-30 22:12:08.782845", "step": 2366, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:08.823676", "step": 2366, "epoch": 3 }, { "type": "loss", "content": 0.00047834464930929244, "timestamp": "2025-09-30 22:12:08.829550", "step": 2367, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:08.865440", "step": 2367, "epoch": 3 }, { "type": "loss", "content": 0.031571704894304276, "timestamp": "2025-09-30 22:12:08.889965", "step": 2368, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:08.925521", "step": 2368, "epoch": 3 }, { "type": "loss", "content": 8.10366400401108e-05, "timestamp": "2025-09-30 22:12:08.929028", "step": 2369, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:08.973742", "step": 2369, "epoch": 3 }, { "type": "loss", "content": 0.00013628315355163068, "timestamp": "2025-09-30 22:12:08.977054", "step": 2370, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:09.032106", "step": 2370, "epoch": 3 }, { "type": "loss", "content": 0.036459606140851974, "timestamp": "2025-09-30 22:12:09.036007", "step": 2371, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:09.070983", "step": 2371, "epoch": 3 }, { "type": "loss", "content": 0.018056688830256462, "timestamp": "2025-09-30 22:12:09.106327", "step": 2372, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:09.140407", "step": 2372, "epoch": 3 }, { "type": "loss", "content": 0.035005517303943634, "timestamp": "2025-09-30 22:12:09.145133", "step": 2373, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:09.180499", "step": 2373, "epoch": 3 }, { "type": "loss", "content": 0.0010858295718207955, "timestamp": "2025-09-30 22:12:09.194325", "step": 2374, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:09.230899", "step": 2374, "epoch": 3 }, { "type": "loss", "content": 0.0038253210950642824, "timestamp": "2025-09-30 22:12:09.234483", "step": 2375, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:09.272120", "step": 2375, "epoch": 3 }, { "type": "loss", "content": 0.0023231101222336292, "timestamp": "2025-09-30 22:12:09.297811", "step": 2376, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:09.332286", "step": 2376, "epoch": 3 }, { "type": "loss", "content": 0.0017414873000234365, "timestamp": "2025-09-30 22:12:09.334687", "step": 2377, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:09.373795", "step": 2377, "epoch": 3 }, { "type": "loss", "content": 0.030804920941591263, "timestamp": "2025-09-30 22:12:09.387863", "step": 2378, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:09.423964", "step": 2378, "epoch": 3 }, { "type": "loss", "content": 0.0013301247963681817, "timestamp": "2025-09-30 22:12:09.427559", "step": 2379, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:09.461909", "step": 2379, "epoch": 3 }, { "type": "loss", "content": 0.0005260419566184282, "timestamp": "2025-09-30 22:12:09.485744", "step": 2380, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:09.521679", "step": 2380, "epoch": 3 }, { "type": "loss", "content": 0.0029137025121599436, "timestamp": "2025-09-30 22:12:09.524875", "step": 2381, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:09.562647", "step": 2381, "epoch": 3 }, { "type": "loss", "content": 0.0006266444106586277, "timestamp": "2025-09-30 22:12:09.568225", "step": 2382, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:09.604472", "step": 2382, "epoch": 3 }, { "type": "loss", "content": 0.02861565351486206, "timestamp": "2025-09-30 22:12:09.608029", "step": 2383, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:09.642994", "step": 2383, "epoch": 3 }, { "type": "loss", "content": 0.019098343327641487, "timestamp": "2025-09-30 22:12:09.668409", "step": 2384, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:09.709435", "step": 2384, "epoch": 3 }, { "type": "loss", "content": 0.00022823250037617981, "timestamp": "2025-09-30 22:12:09.713273", "step": 2385, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:09.749008", "step": 2385, "epoch": 3 }, { "type": "loss", "content": 0.02544480934739113, "timestamp": "2025-09-30 22:12:09.755914", "step": 2386, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:09.802396", "step": 2386, "epoch": 3 }, { "type": "loss", "content": 0.001819111406803131, "timestamp": "2025-09-30 22:12:09.805901", "step": 2387, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:09.840618", "step": 2387, "epoch": 3 }, { "type": "loss", "content": 0.016899127513170242, "timestamp": "2025-09-30 22:12:09.865412", "step": 2388, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:09.905943", "step": 2388, "epoch": 3 }, { "type": "loss", "content": 0.0014039212837815285, "timestamp": "2025-09-30 22:12:09.909281", "step": 2389, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:09.975526", "step": 2389, "epoch": 3 }, { "type": "loss", "content": 0.00031415908597409725, "timestamp": "2025-09-30 22:12:09.979228", "step": 2390, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:10.015441", "step": 2390, "epoch": 3 }, { "type": "loss", "content": 0.0005526886088773608, "timestamp": "2025-09-30 22:12:10.018176", "step": 2391, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:10.054781", "step": 2391, "epoch": 3 }, { "type": "loss", "content": 0.017070923000574112, "timestamp": "2025-09-30 22:12:10.079783", "step": 2392, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:10.116338", "step": 2392, "epoch": 3 }, { "type": "loss", "content": 0.0017836365150287747, "timestamp": "2025-09-30 22:12:10.120300", "step": 2393, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:10.160381", "step": 2393, "epoch": 3 }, { "type": "loss", "content": 0.013684429228305817, "timestamp": "2025-09-30 22:12:10.163909", "step": 2394, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:12:11.274444", "step": 2394, "epoch": 3 }, { "type": "pplx", "content": 65413116.89364355, "timestamp": "2025-09-30 22:12:11.287171", "step": 2394, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:11.326687", "step": 2394, "epoch": 3 }, { "type": "loss", "content": 0.014189009554684162, "timestamp": "2025-09-30 22:12:11.331378", "step": 2395, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:11.390147", "step": 2395, "epoch": 3 }, { "type": "loss", "content": 0.00021919552818872035, "timestamp": "2025-09-30 22:12:11.422375", "step": 2396, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:11.470444", "step": 2396, "epoch": 3 }, { "type": "loss", "content": 0.0008827035198919475, "timestamp": "2025-09-30 22:12:11.473912", "step": 2397, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:11.519495", "step": 2397, "epoch": 3 }, { "type": "loss", "content": 0.030242696404457092, "timestamp": "2025-09-30 22:12:11.524313", "step": 2398, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:11.557997", "step": 2398, "epoch": 3 }, { "type": "loss", "content": 0.013079182244837284, "timestamp": "2025-09-30 22:12:11.562352", "step": 2399, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:11.602505", "step": 2399, "epoch": 3 }, { "type": "loss", "content": 0.00754732359200716, "timestamp": "2025-09-30 22:12:11.633917", "step": 2400, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:11.667168", "step": 2400, "epoch": 3 }, { "type": "loss", "content": 0.0012320360401645303, "timestamp": "2025-09-30 22:12:11.670956", "step": 2401, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:12:11.704264", "step": 2401, "epoch": 3 }, { "type": "loss", "content": 0.026908395811915398, "timestamp": "2025-09-30 22:12:11.710053", "step": 2402, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:11.742404", "step": 2402, "epoch": 3 }, { "type": "loss", "content": 0.0063280873000621796, "timestamp": "2025-09-30 22:12:11.744904", "step": 2403, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:11.781457", "step": 2403, "epoch": 3 }, { "type": "loss", "content": 0.0008967557223513722, "timestamp": "2025-09-30 22:12:11.805821", "step": 2404, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:11.842617", "step": 2404, "epoch": 3 }, { "type": "loss", "content": 0.0037160839419811964, "timestamp": "2025-09-30 22:12:11.848816", "step": 2405, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:11.881560", "step": 2405, "epoch": 3 }, { "type": "loss", "content": 0.0011413343017920852, "timestamp": "2025-09-30 22:12:11.892778", "step": 2406, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:11.926028", "step": 2406, "epoch": 3 }, { "type": "loss", "content": 0.024273769930005074, "timestamp": "2025-09-30 22:12:11.928620", "step": 2407, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:11.983045", "step": 2407, "epoch": 3 }, { "type": "loss", "content": 0.0033409486059099436, "timestamp": "2025-09-30 22:12:12.008628", "step": 2408, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:12.043425", "step": 2408, "epoch": 3 }, { "type": "loss", "content": 0.0031633952166885138, "timestamp": "2025-09-30 22:12:12.045444", "step": 2409, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:12.082323", "step": 2409, "epoch": 3 }, { "type": "loss", "content": 0.006252944469451904, "timestamp": "2025-09-30 22:12:12.085624", "step": 2410, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:12.119050", "step": 2410, "epoch": 3 }, { "type": "loss", "content": 0.008213681168854237, "timestamp": "2025-09-30 22:12:12.122300", "step": 2411, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:12:12.162129", "step": 2411, "epoch": 3 }, { "type": "loss", "content": 0.010081358253955841, "timestamp": "2025-09-30 22:12:12.186003", "step": 2412, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:12.222342", "step": 2412, "epoch": 3 }, { "type": "loss", "content": 0.016692589968442917, "timestamp": "2025-09-30 22:12:12.232519", "step": 2413, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:12.270128", "step": 2413, "epoch": 3 }, { "type": "loss", "content": 0.007243197411298752, "timestamp": "2025-09-30 22:12:12.273893", "step": 2414, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:12.333563", "step": 2414, "epoch": 3 }, { "type": "loss", "content": 0.002702240599319339, "timestamp": "2025-09-30 22:12:12.348304", "step": 2415, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:12.384547", "step": 2415, "epoch": 3 }, { "type": "loss", "content": 0.0019787922501564026, "timestamp": "2025-09-30 22:12:12.409192", "step": 2416, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:12.445124", "step": 2416, "epoch": 3 }, { "type": "loss", "content": 0.006027872674167156, "timestamp": "2025-09-30 22:12:12.449558", "step": 2417, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:12.490956", "step": 2417, "epoch": 3 }, { "type": "loss", "content": 0.008715354837477207, "timestamp": "2025-09-30 22:12:12.494453", "step": 2418, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:12.527345", "step": 2418, "epoch": 3 }, { "type": "loss", "content": 0.009561752900481224, "timestamp": "2025-09-30 22:12:12.530277", "step": 2419, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:12.580919", "step": 2419, "epoch": 3 }, { "type": "loss", "content": 0.0023871201556175947, "timestamp": "2025-09-30 22:12:12.605637", "step": 2420, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:12.637007", "step": 2420, "epoch": 3 }, { "type": "loss", "content": 0.00500152911990881, "timestamp": "2025-09-30 22:12:12.639971", "step": 2421, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:12.681120", "step": 2421, "epoch": 3 }, { "type": "loss", "content": 0.012087523005902767, "timestamp": "2025-09-30 22:12:12.684477", "step": 2422, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:12.727338", "step": 2422, "epoch": 3 }, { "type": "loss", "content": 0.0017976844683289528, "timestamp": "2025-09-30 22:12:12.731067", "step": 2423, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:12.780306", "step": 2423, "epoch": 3 }, { "type": "loss", "content": 0.013470481149852276, "timestamp": "2025-09-30 22:12:12.804491", "step": 2424, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-09-30 22:12:12.840920", "step": 2424, "epoch": 3 }, { "type": "loss", "content": 0.0020187811460345984, "timestamp": "2025-09-30 22:12:12.843628", "step": 2425, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:12.873849", "step": 2425, "epoch": 3 }, { "type": "loss", "content": 0.004044502507895231, "timestamp": "2025-09-30 22:12:12.876983", "step": 2426, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:12.909004", "step": 2426, "epoch": 3 }, { "type": "loss", "content": 0.019353091716766357, "timestamp": "2025-09-30 22:12:12.915834", "step": 2427, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:12.958545", "step": 2427, "epoch": 3 }, { "type": "loss", "content": 0.0004785063210874796, "timestamp": "2025-09-30 22:12:12.990788", "step": 2428, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:13.044761", "step": 2428, "epoch": 3 }, { "type": "loss", "content": 0.006889610085636377, "timestamp": "2025-09-30 22:12:13.058763", "step": 2429, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:13.118812", "step": 2429, "epoch": 3 }, { "type": "loss", "content": 0.004911174066364765, "timestamp": "2025-09-30 22:12:13.123717", "step": 2430, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:13.171498", "step": 2430, "epoch": 3 }, { "type": "loss", "content": 0.0064652832224965096, "timestamp": "2025-09-30 22:12:13.178647", "step": 2431, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:13.223671", "step": 2431, "epoch": 3 }, { "type": "loss", "content": 0.003408350283280015, "timestamp": "2025-09-30 22:12:13.249700", "step": 2432, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:13.286262", "step": 2432, "epoch": 3 }, { "type": "loss", "content": 0.025650396943092346, "timestamp": "2025-09-30 22:12:13.288935", "step": 2433, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:13.323316", "step": 2433, "epoch": 3 }, { "type": "loss", "content": 0.0033151302486658096, "timestamp": "2025-09-30 22:12:13.326983", "step": 2434, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:13.366555", "step": 2434, "epoch": 3 }, { "type": "loss", "content": 0.010545426048338413, "timestamp": "2025-09-30 22:12:13.370016", "step": 2435, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:13.401499", "step": 2435, "epoch": 3 }, { "type": "loss", "content": 0.011878552846610546, "timestamp": "2025-09-30 22:12:13.425534", "step": 2436, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:13.464542", "step": 2436, "epoch": 3 }, { "type": "loss", "content": 0.0001749217917677015, "timestamp": "2025-09-30 22:12:13.467762", "step": 2437, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:13.509444", "step": 2437, "epoch": 3 }, { "type": "loss", "content": 0.04364948347210884, "timestamp": "2025-09-30 22:12:13.512321", "step": 2438, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:13.554957", "step": 2438, "epoch": 3 }, { "type": "loss", "content": 0.003104813862591982, "timestamp": "2025-09-30 22:12:13.557897", "step": 2439, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:13.595618", "step": 2439, "epoch": 3 }, { "type": "loss", "content": 0.0048273103311657906, "timestamp": "2025-09-30 22:12:13.623372", "step": 2440, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:13.675134", "step": 2440, "epoch": 3 }, { "type": "loss", "content": 0.005417822860181332, "timestamp": "2025-09-30 22:12:13.682437", "step": 2441, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:13.717391", "step": 2441, "epoch": 3 }, { "type": "loss", "content": 0.011303612031042576, "timestamp": "2025-09-30 22:12:13.719797", "step": 2442, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:13.754682", "step": 2442, "epoch": 3 }, { "type": "loss", "content": 0.0019748841878026724, "timestamp": "2025-09-30 22:12:13.756935", "step": 2443, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:13.797829", "step": 2443, "epoch": 3 }, { "type": "loss", "content": 0.0004670954658649862, "timestamp": "2025-09-30 22:12:13.822910", "step": 2444, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:13.867396", "step": 2444, "epoch": 3 }, { "type": "loss", "content": 0.0493459478020668, "timestamp": "2025-09-30 22:12:13.871818", "step": 2445, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:13.915025", "step": 2445, "epoch": 3 }, { "type": "loss", "content": 0.0002415820927126333, "timestamp": "2025-09-30 22:12:13.917755", "step": 2446, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:13.971422", "step": 2446, "epoch": 3 }, { "type": "loss", "content": 0.0018418595427647233, "timestamp": "2025-09-30 22:12:13.988199", "step": 2447, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:14.023598", "step": 2447, "epoch": 3 }, { "type": "loss", "content": 0.018292101100087166, "timestamp": "2025-09-30 22:12:14.049047", "step": 2448, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:14.121049", "step": 2448, "epoch": 3 }, { "type": "loss", "content": 0.01522571686655283, "timestamp": "2025-09-30 22:12:14.124100", "step": 2449, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:14.157646", "step": 2449, "epoch": 3 }, { "type": "loss", "content": 0.0033422100823372602, "timestamp": "2025-09-30 22:12:14.161885", "step": 2450, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:14.195614", "step": 2450, "epoch": 3 }, { "type": "loss", "content": 0.002138448180630803, "timestamp": "2025-09-30 22:12:14.210631", "step": 2451, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:12:15.179053", "step": 2451, "epoch": 3 }, { "type": "pplx", "content": 62585234.83673584, "timestamp": "2025-09-30 22:12:15.183192", "step": 2451, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:15.222614", "step": 2451, "epoch": 3 }, { "type": "loss", "content": 0.036872122436761856, "timestamp": "2025-09-30 22:12:15.247415", "step": 2452, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:15.281654", "step": 2452, "epoch": 3 }, { "type": "loss", "content": 0.0013359521981328726, "timestamp": "2025-09-30 22:12:15.286305", "step": 2453, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:15.320155", "step": 2453, "epoch": 3 }, { "type": "loss", "content": 0.0018873109947890043, "timestamp": "2025-09-30 22:12:15.322950", "step": 2454, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:15.366238", "step": 2454, "epoch": 3 }, { "type": "loss", "content": 0.028096193447709084, "timestamp": "2025-09-30 22:12:15.374875", "step": 2455, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:15.419954", "step": 2455, "epoch": 3 }, { "type": "loss", "content": 0.00184065627399832, "timestamp": "2025-09-30 22:12:15.444890", "step": 2456, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:15.478246", "step": 2456, "epoch": 3 }, { "type": "loss", "content": 0.0008506966987624764, "timestamp": "2025-09-30 22:12:15.483383", "step": 2457, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:15.521614", "step": 2457, "epoch": 3 }, { "type": "loss", "content": 0.004837132524698973, "timestamp": "2025-09-30 22:12:15.524898", "step": 2458, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:15.559790", "step": 2458, "epoch": 3 }, { "type": "loss", "content": 0.011527560651302338, "timestamp": "2025-09-30 22:12:15.563171", "step": 2459, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:15.602170", "step": 2459, "epoch": 3 }, { "type": "loss", "content": 0.002231266815215349, "timestamp": "2025-09-30 22:12:15.626342", "step": 2460, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:15.660092", "step": 2460, "epoch": 3 }, { "type": "loss", "content": 0.009949088096618652, "timestamp": "2025-09-30 22:12:15.664775", "step": 2461, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:15.702591", "step": 2461, "epoch": 3 }, { "type": "loss", "content": 0.016047203913331032, "timestamp": "2025-09-30 22:12:15.705987", "step": 2462, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:15.761061", "step": 2462, "epoch": 3 }, { "type": "loss", "content": 0.03306948393583298, "timestamp": "2025-09-30 22:12:15.764529", "step": 2463, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:15.796586", "step": 2463, "epoch": 3 }, { "type": "loss", "content": 0.011734615080058575, "timestamp": "2025-09-30 22:12:15.821966", "step": 2464, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:15.856787", "step": 2464, "epoch": 3 }, { "type": "loss", "content": 0.004299917258322239, "timestamp": "2025-09-30 22:12:15.859498", "step": 2465, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:15.904285", "step": 2465, "epoch": 3 }, { "type": "loss", "content": 0.0013004597276449203, "timestamp": "2025-09-30 22:12:15.913300", "step": 2466, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:15.957918", "step": 2466, "epoch": 3 }, { "type": "loss", "content": 0.007806502282619476, "timestamp": "2025-09-30 22:12:15.960512", "step": 2467, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:15.993666", "step": 2467, "epoch": 3 }, { "type": "loss", "content": 0.05851675942540169, "timestamp": "2025-09-30 22:12:16.018333", "step": 2468, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:16.064242", "step": 2468, "epoch": 3 }, { "type": "loss", "content": 0.0025632530450820923, "timestamp": "2025-09-30 22:12:16.067653", "step": 2469, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:16.107226", "step": 2469, "epoch": 3 }, { "type": "loss", "content": 0.02141547203063965, "timestamp": "2025-09-30 22:12:16.110708", "step": 2470, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:16.151475", "step": 2470, "epoch": 3 }, { "type": "loss", "content": 0.0010793081019073725, "timestamp": "2025-09-30 22:12:16.155860", "step": 2471, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:16.190306", "step": 2471, "epoch": 3 }, { "type": "loss", "content": 0.0025140002835541964, "timestamp": "2025-09-30 22:12:16.214536", "step": 2472, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:16.265471", "step": 2472, "epoch": 3 }, { "type": "loss", "content": 0.0012594551080837846, "timestamp": "2025-09-30 22:12:16.268933", "step": 2473, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:16.308928", "step": 2473, "epoch": 3 }, { "type": "loss", "content": 0.015506746247410774, "timestamp": "2025-09-30 22:12:16.311797", "step": 2474, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:16.346452", "step": 2474, "epoch": 3 }, { "type": "loss", "content": 0.00189836451318115, "timestamp": "2025-09-30 22:12:16.350378", "step": 2475, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:16.391136", "step": 2475, "epoch": 3 }, { "type": "loss", "content": 0.0021572846453636885, "timestamp": "2025-09-30 22:12:16.422709", "step": 2476, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:16.463272", "step": 2476, "epoch": 3 }, { "type": "loss", "content": 0.0027588761877268553, "timestamp": "2025-09-30 22:12:16.466118", "step": 2477, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:16.503079", "step": 2477, "epoch": 3 }, { "type": "loss", "content": 0.008273358456790447, "timestamp": "2025-09-30 22:12:16.506913", "step": 2478, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:16.541526", "step": 2478, "epoch": 3 }, { "type": "loss", "content": 0.003073727013543248, "timestamp": "2025-09-30 22:12:16.552618", "step": 2479, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:16.586838", "step": 2479, "epoch": 3 }, { "type": "loss", "content": 0.00348069379106164, "timestamp": "2025-09-30 22:12:16.610849", "step": 2480, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:16.657545", "step": 2480, "epoch": 3 }, { "type": "loss", "content": 0.011410152539610863, "timestamp": "2025-09-30 22:12:16.660386", "step": 2481, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:16.727292", "step": 2481, "epoch": 3 }, { "type": "loss", "content": 0.0004028068215120584, "timestamp": "2025-09-30 22:12:16.731599", "step": 2482, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:16.791057", "step": 2482, "epoch": 3 }, { "type": "loss", "content": 0.0011346134124323726, "timestamp": "2025-09-30 22:12:16.793551", "step": 2483, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:16.827410", "step": 2483, "epoch": 3 }, { "type": "loss", "content": 0.0011217163410037756, "timestamp": "2025-09-30 22:12:16.852392", "step": 2484, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:16.888043", "step": 2484, "epoch": 3 }, { "type": "loss", "content": 0.016155900433659554, "timestamp": "2025-09-30 22:12:16.891312", "step": 2485, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:16.939331", "step": 2485, "epoch": 3 }, { "type": "loss", "content": 0.008874503895640373, "timestamp": "2025-09-30 22:12:16.943070", "step": 2486, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:16.985134", "step": 2486, "epoch": 3 }, { "type": "loss", "content": 0.020130667835474014, "timestamp": "2025-09-30 22:12:16.988103", "step": 2487, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:17.022044", "step": 2487, "epoch": 3 }, { "type": "loss", "content": 0.008245137520134449, "timestamp": "2025-09-30 22:12:17.053088", "step": 2488, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:17.093042", "step": 2488, "epoch": 3 }, { "type": "loss", "content": 0.00684535875916481, "timestamp": "2025-09-30 22:12:17.097028", "step": 2489, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:17.128858", "step": 2489, "epoch": 3 }, { "type": "loss", "content": 7.354576518991962e-05, "timestamp": "2025-09-30 22:12:17.131648", "step": 2490, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:17.164087", "step": 2490, "epoch": 3 }, { "type": "loss", "content": 4.672312206821516e-05, "timestamp": "2025-09-30 22:12:17.168337", "step": 2491, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:17.208240", "step": 2491, "epoch": 3 }, { "type": "loss", "content": 0.04931880906224251, "timestamp": "2025-09-30 22:12:17.232431", "step": 2492, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:17.265929", "step": 2492, "epoch": 3 }, { "type": "loss", "content": 8.33717203931883e-05, "timestamp": "2025-09-30 22:12:17.269271", "step": 2493, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:17.317146", "step": 2493, "epoch": 3 }, { "type": "loss", "content": 0.002100432524457574, "timestamp": "2025-09-30 22:12:17.319594", "step": 2494, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:17.351844", "step": 2494, "epoch": 3 }, { "type": "loss", "content": 0.033802784979343414, "timestamp": "2025-09-30 22:12:17.355014", "step": 2495, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:17.392473", "step": 2495, "epoch": 3 }, { "type": "loss", "content": 7.228619506349787e-05, "timestamp": "2025-09-30 22:12:17.416363", "step": 2496, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:17.448412", "step": 2496, "epoch": 3 }, { "type": "loss", "content": 0.024196816608309746, "timestamp": "2025-09-30 22:12:17.451206", "step": 2497, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:17.483674", "step": 2497, "epoch": 3 }, { "type": "loss", "content": 0.004435116890817881, "timestamp": "2025-09-30 22:12:17.494907", "step": 2498, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:17.539375", "step": 2498, "epoch": 3 }, { "type": "loss", "content": 0.0020751405972987413, "timestamp": "2025-09-30 22:12:17.542827", "step": 2499, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:17.580444", "step": 2499, "epoch": 3 }, { "type": "loss", "content": 0.00010092502634506673, "timestamp": "2025-09-30 22:12:17.607680", "step": 2500, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2500", "timestamp": "2025-09-30 22:12:24.441468", "step": 2500, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:24.476136", "step": 2500, "epoch": 3 }, { "type": "loss", "content": 0.00021722108067478985, "timestamp": "2025-09-30 22:12:24.479174", "step": 2501, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:24.512768", "step": 2501, "epoch": 3 }, { "type": "loss", "content": 0.00021702356752939522, "timestamp": "2025-09-30 22:12:24.515547", "step": 2502, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:24.552425", "step": 2502, "epoch": 3 }, { "type": "loss", "content": 0.03270643576979637, "timestamp": "2025-09-30 22:12:24.555294", "step": 2503, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:24.588294", "step": 2503, "epoch": 3 }, { "type": "loss", "content": 0.033587705343961716, "timestamp": "2025-09-30 22:12:24.621200", "step": 2504, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:24.657910", "step": 2504, "epoch": 3 }, { "type": "loss", "content": 0.0011009655427187681, "timestamp": "2025-09-30 22:12:24.661900", "step": 2505, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:24.711655", "step": 2505, "epoch": 3 }, { "type": "loss", "content": 0.025243079289793968, "timestamp": "2025-09-30 22:12:24.714798", "step": 2506, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:24.757544", "step": 2506, "epoch": 3 }, { "type": "loss", "content": 0.01099986769258976, "timestamp": "2025-09-30 22:12:24.768866", "step": 2507, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:24.809829", "step": 2507, "epoch": 3 }, { "type": "loss", "content": 0.04144655540585518, "timestamp": "2025-09-30 22:12:24.836337", "step": 2508, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:12:25.877102", "step": 2508, "epoch": 3 }, { "type": "pplx", "content": 51440041.32039302, "timestamp": "2025-09-30 22:12:25.879619", "step": 2508, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:25.913015", "step": 2508, "epoch": 3 }, { "type": "loss", "content": 0.05126844719052315, "timestamp": "2025-09-30 22:12:25.915528", "step": 2509, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:25.954174", "step": 2509, "epoch": 3 }, { "type": "loss", "content": 0.0024910145439207554, "timestamp": "2025-09-30 22:12:25.964119", "step": 2510, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:26.003353", "step": 2510, "epoch": 3 }, { "type": "loss", "content": 0.002371475100517273, "timestamp": "2025-09-30 22:12:26.007123", "step": 2511, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:26.044932", "step": 2511, "epoch": 3 }, { "type": "loss", "content": 0.008505993522703648, "timestamp": "2025-09-30 22:12:26.068871", "step": 2512, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:26.100655", "step": 2512, "epoch": 3 }, { "type": "loss", "content": 0.008775151334702969, "timestamp": "2025-09-30 22:12:26.103596", "step": 2513, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:26.133666", "step": 2513, "epoch": 3 }, { "type": "loss", "content": 0.005994870793074369, "timestamp": "2025-09-30 22:12:26.135971", "step": 2514, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:26.174131", "step": 2514, "epoch": 3 }, { "type": "loss", "content": 0.004314147401601076, "timestamp": "2025-09-30 22:12:26.177619", "step": 2515, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:26.212159", "step": 2515, "epoch": 3 }, { "type": "loss", "content": 0.010322661139070988, "timestamp": "2025-09-30 22:12:26.237417", "step": 2516, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:26.268834", "step": 2516, "epoch": 3 }, { "type": "loss", "content": 0.009751622565090656, "timestamp": "2025-09-30 22:12:26.271108", "step": 2517, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:26.310414", "step": 2517, "epoch": 3 }, { "type": "loss", "content": 0.006905748508870602, "timestamp": "2025-09-30 22:12:26.316132", "step": 2518, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:26.358662", "step": 2518, "epoch": 3 }, { "type": "loss", "content": 0.041554901748895645, "timestamp": "2025-09-30 22:12:26.370479", "step": 2519, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:26.407451", "step": 2519, "epoch": 3 }, { "type": "loss", "content": 0.0019552961457520723, "timestamp": "2025-09-30 22:12:26.432159", "step": 2520, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:26.477273", "step": 2520, "epoch": 3 }, { "type": "loss", "content": 0.00302482838742435, "timestamp": "2025-09-30 22:12:26.481855", "step": 2521, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:26.519834", "step": 2521, "epoch": 3 }, { "type": "loss", "content": 0.0036011997144669294, "timestamp": "2025-09-30 22:12:26.522302", "step": 2522, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:26.560096", "step": 2522, "epoch": 3 }, { "type": "loss", "content": 0.04953055456280708, "timestamp": "2025-09-30 22:12:26.562550", "step": 2523, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:26.604060", "step": 2523, "epoch": 3 }, { "type": "loss", "content": 0.024661192670464516, "timestamp": "2025-09-30 22:12:26.632779", "step": 2524, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:26.668836", "step": 2524, "epoch": 3 }, { "type": "loss", "content": 0.007575987372547388, "timestamp": "2025-09-30 22:12:26.673588", "step": 2525, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:26.712052", "step": 2525, "epoch": 3 }, { "type": "loss", "content": 0.01171951089054346, "timestamp": "2025-09-30 22:12:26.714809", "step": 2526, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:26.746519", "step": 2526, "epoch": 3 }, { "type": "loss", "content": 0.007814085111021996, "timestamp": "2025-09-30 22:12:26.749250", "step": 2527, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:26.790855", "step": 2527, "epoch": 3 }, { "type": "loss", "content": 0.03643188998103142, "timestamp": "2025-09-30 22:12:26.814516", "step": 2528, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:26.847222", "step": 2528, "epoch": 3 }, { "type": "loss", "content": 0.005677658133208752, "timestamp": "2025-09-30 22:12:26.850913", "step": 2529, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:26.885157", "step": 2529, "epoch": 3 }, { "type": "loss", "content": 0.02060539647936821, "timestamp": "2025-09-30 22:12:26.890913", "step": 2530, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:26.929792", "step": 2530, "epoch": 3 }, { "type": "loss", "content": 0.019244346767663956, "timestamp": "2025-09-30 22:12:26.935318", "step": 2531, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:26.974780", "step": 2531, "epoch": 3 }, { "type": "loss", "content": 0.005939108785241842, "timestamp": "2025-09-30 22:12:27.001770", "step": 2532, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:27.033678", "step": 2532, "epoch": 3 }, { "type": "loss", "content": 0.025780802592635155, "timestamp": "2025-09-30 22:12:27.037527", "step": 2533, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:27.073435", "step": 2533, "epoch": 3 }, { "type": "loss", "content": 0.0057016233913600445, "timestamp": "2025-09-30 22:12:27.078239", "step": 2534, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:27.111678", "step": 2534, "epoch": 3 }, { "type": "loss", "content": 0.005294333212077618, "timestamp": "2025-09-30 22:12:27.114140", "step": 2535, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:27.144822", "step": 2535, "epoch": 3 }, { "type": "loss", "content": 0.018375219777226448, "timestamp": "2025-09-30 22:12:27.169702", "step": 2536, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:27.200582", "step": 2536, "epoch": 3 }, { "type": "loss", "content": 0.014313088729977608, "timestamp": "2025-09-30 22:12:27.202832", "step": 2537, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:27.243292", "step": 2537, "epoch": 3 }, { "type": "loss", "content": 0.0019038409227505326, "timestamp": "2025-09-30 22:12:27.252377", "step": 2538, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:27.284146", "step": 2538, "epoch": 3 }, { "type": "loss", "content": 0.0079488605260849, "timestamp": "2025-09-30 22:12:27.289216", "step": 2539, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:27.321077", "step": 2539, "epoch": 3 }, { "type": "loss", "content": 0.0012947311624884605, "timestamp": "2025-09-30 22:12:27.345027", "step": 2540, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:27.379042", "step": 2540, "epoch": 3 }, { "type": "loss", "content": 0.00036997883580625057, "timestamp": "2025-09-30 22:12:27.381824", "step": 2541, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:27.413535", "step": 2541, "epoch": 3 }, { "type": "loss", "content": 0.0030447926837950945, "timestamp": "2025-09-30 22:12:27.416146", "step": 2542, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:27.449899", "step": 2542, "epoch": 3 }, { "type": "loss", "content": 0.0076310886070132256, "timestamp": "2025-09-30 22:12:27.452189", "step": 2543, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:27.483123", "step": 2543, "epoch": 3 }, { "type": "loss", "content": 0.016252310946583748, "timestamp": "2025-09-30 22:12:27.509141", "step": 2544, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:12:27.541918", "step": 2544, "epoch": 3 }, { "type": "loss", "content": 0.014082059264183044, "timestamp": "2025-09-30 22:12:27.545842", "step": 2545, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:27.582115", "step": 2545, "epoch": 3 }, { "type": "loss", "content": 0.016975652426481247, "timestamp": "2025-09-30 22:12:27.585641", "step": 2546, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:27.629840", "step": 2546, "epoch": 3 }, { "type": "loss", "content": 0.004181718919426203, "timestamp": "2025-09-30 22:12:27.632924", "step": 2547, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:27.664508", "step": 2547, "epoch": 3 }, { "type": "loss", "content": 0.001995036145672202, "timestamp": "2025-09-30 22:12:27.688987", "step": 2548, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:27.727040", "step": 2548, "epoch": 3 }, { "type": "loss", "content": 0.020780479535460472, "timestamp": "2025-09-30 22:12:27.730356", "step": 2549, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:27.765703", "step": 2549, "epoch": 3 }, { "type": "loss", "content": 0.013351112604141235, "timestamp": "2025-09-30 22:12:27.769747", "step": 2550, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:27.803467", "step": 2550, "epoch": 3 }, { "type": "loss", "content": 0.008309331722557545, "timestamp": "2025-09-30 22:12:27.809594", "step": 2551, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:27.842365", "step": 2551, "epoch": 3 }, { "type": "loss", "content": 0.007991375401616096, "timestamp": "2025-09-30 22:12:27.866581", "step": 2552, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:27.901678", "step": 2552, "epoch": 3 }, { "type": "loss", "content": 0.0011701161274686456, "timestamp": "2025-09-30 22:12:27.903831", "step": 2553, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:27.935098", "step": 2553, "epoch": 3 }, { "type": "loss", "content": 0.03201867640018463, "timestamp": "2025-09-30 22:12:27.937765", "step": 2554, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:27.972125", "step": 2554, "epoch": 3 }, { "type": "loss", "content": 0.0001384599308948964, "timestamp": "2025-09-30 22:12:27.975342", "step": 2555, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:28.007132", "step": 2555, "epoch": 3 }, { "type": "loss", "content": 0.00021487221238203347, "timestamp": "2025-09-30 22:12:28.030624", "step": 2556, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:28.061356", "step": 2556, "epoch": 3 }, { "type": "loss", "content": 0.00542490417137742, "timestamp": "2025-09-30 22:12:28.063865", "step": 2557, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:28.094687", "step": 2557, "epoch": 3 }, { "type": "loss", "content": 0.01566317304968834, "timestamp": "2025-09-30 22:12:28.096878", "step": 2558, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:28.133186", "step": 2558, "epoch": 3 }, { "type": "loss", "content": 0.03404511138796806, "timestamp": "2025-09-30 22:12:28.136052", "step": 2559, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:28.168448", "step": 2559, "epoch": 3 }, { "type": "loss", "content": 0.0022068375255912542, "timestamp": "2025-09-30 22:12:28.192768", "step": 2560, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:28.226671", "step": 2560, "epoch": 3 }, { "type": "loss", "content": 0.011201408691704273, "timestamp": "2025-09-30 22:12:28.232748", "step": 2561, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:28.269849", "step": 2561, "epoch": 3 }, { "type": "loss", "content": 0.001101975911296904, "timestamp": "2025-09-30 22:12:28.273520", "step": 2562, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:28.305675", "step": 2562, "epoch": 3 }, { "type": "loss", "content": 0.013607998378574848, "timestamp": "2025-09-30 22:12:28.309824", "step": 2563, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:28.342934", "step": 2563, "epoch": 3 }, { "type": "loss", "content": 0.03017262928187847, "timestamp": "2025-09-30 22:12:28.368890", "step": 2564, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:28.402515", "step": 2564, "epoch": 3 }, { "type": "loss", "content": 0.0012094740523025393, "timestamp": "2025-09-30 22:12:28.408818", "step": 2565, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:12:29.366942", "step": 2565, "epoch": 3 }, { "type": "pplx", "content": 39231890.14571114, "timestamp": "2025-09-30 22:12:29.376737", "step": 2565, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:29.409321", "step": 2565, "epoch": 3 }, { "type": "loss", "content": 0.014957462437450886, "timestamp": "2025-09-30 22:12:29.411859", "step": 2566, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:29.448095", "step": 2566, "epoch": 3 }, { "type": "loss", "content": 0.004640209022909403, "timestamp": "2025-09-30 22:12:29.454261", "step": 2567, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:29.496575", "step": 2567, "epoch": 3 }, { "type": "loss", "content": 0.004045676905661821, "timestamp": "2025-09-30 22:12:29.521337", "step": 2568, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:29.558966", "step": 2568, "epoch": 3 }, { "type": "loss", "content": 0.002006492344662547, "timestamp": "2025-09-30 22:12:29.564906", "step": 2569, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:29.610682", "step": 2569, "epoch": 3 }, { "type": "loss", "content": 0.014200160279870033, "timestamp": "2025-09-30 22:12:29.613197", "step": 2570, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:29.645926", "step": 2570, "epoch": 3 }, { "type": "loss", "content": 0.003748588962480426, "timestamp": "2025-09-30 22:12:29.652741", "step": 2571, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:29.686228", "step": 2571, "epoch": 3 }, { "type": "loss", "content": 0.011747321113944054, "timestamp": "2025-09-30 22:12:29.710794", "step": 2572, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:29.755916", "step": 2572, "epoch": 3 }, { "type": "loss", "content": 0.0015971793327480555, "timestamp": "2025-09-30 22:12:29.758470", "step": 2573, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:29.792048", "step": 2573, "epoch": 3 }, { "type": "loss", "content": 0.010424541309475899, "timestamp": "2025-09-30 22:12:29.795357", "step": 2574, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:29.834368", "step": 2574, "epoch": 3 }, { "type": "loss", "content": 0.0025860874447971582, "timestamp": "2025-09-30 22:12:29.837329", "step": 2575, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:29.872233", "step": 2575, "epoch": 3 }, { "type": "loss", "content": 0.0033157693687826395, "timestamp": "2025-09-30 22:12:29.897788", "step": 2576, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:29.930678", "step": 2576, "epoch": 3 }, { "type": "loss", "content": 0.002955112373456359, "timestamp": "2025-09-30 22:12:29.934228", "step": 2577, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:29.982088", "step": 2577, "epoch": 3 }, { "type": "loss", "content": 0.0027001777198165655, "timestamp": "2025-09-30 22:12:29.990385", "step": 2578, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:30.033243", "step": 2578, "epoch": 3 }, { "type": "loss", "content": 0.004555149935185909, "timestamp": "2025-09-30 22:12:30.036195", "step": 2579, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:30.078336", "step": 2579, "epoch": 3 }, { "type": "loss", "content": 0.00765460729598999, "timestamp": "2025-09-30 22:12:30.104552", "step": 2580, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:30.136627", "step": 2580, "epoch": 3 }, { "type": "loss", "content": 0.001375475199893117, "timestamp": "2025-09-30 22:12:30.141326", "step": 2581, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:30.186922", "step": 2581, "epoch": 3 }, { "type": "loss", "content": 0.004311113618314266, "timestamp": "2025-09-30 22:12:30.189913", "step": 2582, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:30.230673", "step": 2582, "epoch": 3 }, { "type": "loss", "content": 0.001586766797117889, "timestamp": "2025-09-30 22:12:30.233034", "step": 2583, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:30.264135", "step": 2583, "epoch": 3 }, { "type": "loss", "content": 0.0055139511823654175, "timestamp": "2025-09-30 22:12:30.289063", "step": 2584, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:30.325416", "step": 2584, "epoch": 3 }, { "type": "loss", "content": 0.0027511361986398697, "timestamp": "2025-09-30 22:12:30.329234", "step": 2585, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:30.360416", "step": 2585, "epoch": 3 }, { "type": "loss", "content": 0.03730406612157822, "timestamp": "2025-09-30 22:12:30.362790", "step": 2586, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:30.395216", "step": 2586, "epoch": 3 }, { "type": "loss", "content": 0.007691751234233379, "timestamp": "2025-09-30 22:12:30.402560", "step": 2587, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:30.438793", "step": 2587, "epoch": 3 }, { "type": "loss", "content": 0.009390302933752537, "timestamp": "2025-09-30 22:12:30.462611", "step": 2588, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:30.500825", "step": 2588, "epoch": 3 }, { "type": "loss", "content": 0.00674975011497736, "timestamp": "2025-09-30 22:12:30.503073", "step": 2589, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:30.540795", "step": 2589, "epoch": 3 }, { "type": "loss", "content": 0.007475547958165407, "timestamp": "2025-09-30 22:12:30.544910", "step": 2590, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:30.587056", "step": 2590, "epoch": 3 }, { "type": "loss", "content": 0.012665103189647198, "timestamp": "2025-09-30 22:12:30.591335", "step": 2591, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:30.627082", "step": 2591, "epoch": 3 }, { "type": "loss", "content": 0.0015189863042905927, "timestamp": "2025-09-30 22:12:30.652721", "step": 2592, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:30.691876", "step": 2592, "epoch": 3 }, { "type": "loss", "content": 0.0010610331082716584, "timestamp": "2025-09-30 22:12:30.701017", "step": 2593, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:30.736087", "step": 2593, "epoch": 3 }, { "type": "loss", "content": 0.0031566577963531017, "timestamp": "2025-09-30 22:12:30.739780", "step": 2594, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:30.779880", "step": 2594, "epoch": 3 }, { "type": "loss", "content": 0.008387443609535694, "timestamp": "2025-09-30 22:12:30.782575", "step": 2595, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-09-30 22:12:30.830165", "step": 2595, "epoch": 3 }, { "type": "loss", "content": 0.022871583700180054, "timestamp": "2025-09-30 22:12:30.858927", "step": 2596, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:30.893996", "step": 2596, "epoch": 3 }, { "type": "loss", "content": 0.004506740719079971, "timestamp": "2025-09-30 22:12:30.896576", "step": 2597, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:30.949635", "step": 2597, "epoch": 3 }, { "type": "loss", "content": 0.0055312542244791985, "timestamp": "2025-09-30 22:12:30.954854", "step": 2598, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:31.003512", "step": 2598, "epoch": 3 }, { "type": "loss", "content": 0.009669343009591103, "timestamp": "2025-09-30 22:12:31.007371", "step": 2599, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:31.040506", "step": 2599, "epoch": 3 }, { "type": "loss", "content": 0.00065530592110008, "timestamp": "2025-09-30 22:12:31.065660", "step": 2600, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:31.101798", "step": 2600, "epoch": 3 }, { "type": "loss", "content": 0.0021030474454164505, "timestamp": "2025-09-30 22:12:31.104441", "step": 2601, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:31.138107", "step": 2601, "epoch": 3 }, { "type": "loss", "content": 0.0021827437449246645, "timestamp": "2025-09-30 22:12:31.140443", "step": 2602, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:31.178379", "step": 2602, "epoch": 3 }, { "type": "loss", "content": 0.0010505650425329804, "timestamp": "2025-09-30 22:12:31.180654", "step": 2603, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:31.215997", "step": 2603, "epoch": 3 }, { "type": "loss", "content": 0.015516921877861023, "timestamp": "2025-09-30 22:12:31.239625", "step": 2604, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:31.276673", "step": 2604, "epoch": 3 }, { "type": "loss", "content": 0.006490841507911682, "timestamp": "2025-09-30 22:12:31.279848", "step": 2605, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:31.315663", "step": 2605, "epoch": 3 }, { "type": "loss", "content": 0.0006698822253383696, "timestamp": "2025-09-30 22:12:31.318399", "step": 2606, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:31.353069", "step": 2606, "epoch": 3 }, { "type": "loss", "content": 0.02557486668229103, "timestamp": "2025-09-30 22:12:31.355880", "step": 2607, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:31.394065", "step": 2607, "epoch": 3 }, { "type": "loss", "content": 0.003963562194257975, "timestamp": "2025-09-30 22:12:31.421702", "step": 2608, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:31.455888", "step": 2608, "epoch": 3 }, { "type": "loss", "content": 0.00249542691744864, "timestamp": "2025-09-30 22:12:31.460311", "step": 2609, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:31.493897", "step": 2609, "epoch": 3 }, { "type": "loss", "content": 0.002343180123716593, "timestamp": "2025-09-30 22:12:31.496673", "step": 2610, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:31.542619", "step": 2610, "epoch": 3 }, { "type": "loss", "content": 0.002632777439430356, "timestamp": "2025-09-30 22:12:31.545298", "step": 2611, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:31.579660", "step": 2611, "epoch": 3 }, { "type": "loss", "content": 0.00020765874069184065, "timestamp": "2025-09-30 22:12:31.603546", "step": 2612, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:31.636516", "step": 2612, "epoch": 3 }, { "type": "loss", "content": 0.0328451506793499, "timestamp": "2025-09-30 22:12:31.639116", "step": 2613, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:31.675980", "step": 2613, "epoch": 3 }, { "type": "loss", "content": 0.0003175671736244112, "timestamp": "2025-09-30 22:12:31.678731", "step": 2614, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:31.721871", "step": 2614, "epoch": 3 }, { "type": "loss", "content": 0.0012592887505888939, "timestamp": "2025-09-30 22:12:31.727277", "step": 2615, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:31.769963", "step": 2615, "epoch": 3 }, { "type": "loss", "content": 0.0013843054184690118, "timestamp": "2025-09-30 22:12:31.796074", "step": 2616, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:31.829063", "step": 2616, "epoch": 3 }, { "type": "loss", "content": 0.0089047159999609, "timestamp": "2025-09-30 22:12:31.832200", "step": 2617, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:31.871034", "step": 2617, "epoch": 3 }, { "type": "loss", "content": 0.0008639580337330699, "timestamp": "2025-09-30 22:12:31.874295", "step": 2618, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:31.908469", "step": 2618, "epoch": 3 }, { "type": "loss", "content": 0.00013822069740854204, "timestamp": "2025-09-30 22:12:31.911191", "step": 2619, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:31.951390", "step": 2619, "epoch": 3 }, { "type": "loss", "content": 6.882520392537117e-05, "timestamp": "2025-09-30 22:12:31.976319", "step": 2620, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:32.013452", "step": 2620, "epoch": 3 }, { "type": "loss", "content": 4.944354441249743e-05, "timestamp": "2025-09-30 22:12:32.016223", "step": 2621, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:32.051172", "step": 2621, "epoch": 3 }, { "type": "loss", "content": 0.008317261002957821, "timestamp": "2025-09-30 22:12:32.061533", "step": 2622, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:12:33.118249", "step": 2622, "epoch": 3 }, { "type": "pplx", "content": 46235497.62527117, "timestamp": "2025-09-30 22:12:33.120941", "step": 2622, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:33.156231", "step": 2622, "epoch": 3 }, { "type": "loss", "content": 0.003576470073312521, "timestamp": "2025-09-30 22:12:33.159119", "step": 2623, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:33.207386", "step": 2623, "epoch": 3 }, { "type": "loss", "content": 6.79981749271974e-05, "timestamp": "2025-09-30 22:12:33.231660", "step": 2624, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:33.273982", "step": 2624, "epoch": 3 }, { "type": "loss", "content": 0.027295293286442757, "timestamp": "2025-09-30 22:12:33.282084", "step": 2625, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:33.314005", "step": 2625, "epoch": 3 }, { "type": "loss", "content": 0.0028682462871074677, "timestamp": "2025-09-30 22:12:33.322254", "step": 2626, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:33.365089", "step": 2626, "epoch": 3 }, { "type": "loss", "content": 0.0009013204253278673, "timestamp": "2025-09-30 22:12:33.368126", "step": 2627, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:33.411800", "step": 2627, "epoch": 3 }, { "type": "loss", "content": 0.026948420330882072, "timestamp": "2025-09-30 22:12:33.438808", "step": 2628, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:33.478169", "step": 2628, "epoch": 3 }, { "type": "loss", "content": 0.00045880835386924446, "timestamp": "2025-09-30 22:12:33.485131", "step": 2629, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:33.519423", "step": 2629, "epoch": 3 }, { "type": "loss", "content": 0.000256069382885471, "timestamp": "2025-09-30 22:12:33.522257", "step": 2630, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:33.554349", "step": 2630, "epoch": 3 }, { "type": "loss", "content": 0.0002860224631149322, "timestamp": "2025-09-30 22:12:33.557109", "step": 2631, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:33.593198", "step": 2631, "epoch": 3 }, { "type": "loss", "content": 0.023285450413823128, "timestamp": "2025-09-30 22:12:33.617735", "step": 2632, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:33.652399", "step": 2632, "epoch": 3 }, { "type": "loss", "content": 0.0002503176510799676, "timestamp": "2025-09-30 22:12:33.656684", "step": 2633, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:33.696388", "step": 2633, "epoch": 3 }, { "type": "loss", "content": 0.0053505441173911095, "timestamp": "2025-09-30 22:12:33.705582", "step": 2634, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:33.741232", "step": 2634, "epoch": 3 }, { "type": "loss", "content": 0.0028885682113468647, "timestamp": "2025-09-30 22:12:33.752920", "step": 2635, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:33.796115", "step": 2635, "epoch": 3 }, { "type": "loss", "content": 0.035982340574264526, "timestamp": "2025-09-30 22:12:33.822390", "step": 2636, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:33.858001", "step": 2636, "epoch": 3 }, { "type": "loss", "content": 0.009500211104750633, "timestamp": "2025-09-30 22:12:33.870511", "step": 2637, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:33.905799", "step": 2637, "epoch": 3 }, { "type": "loss", "content": 0.0003542073245625943, "timestamp": "2025-09-30 22:12:33.916748", "step": 2638, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:33.963423", "step": 2638, "epoch": 3 }, { "type": "loss", "content": 0.03855966776609421, "timestamp": "2025-09-30 22:12:33.967001", "step": 2639, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:34.003623", "step": 2639, "epoch": 3 }, { "type": "loss", "content": 0.01165574137121439, "timestamp": "2025-09-30 22:12:34.032569", "step": 2640, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:34.070451", "step": 2640, "epoch": 3 }, { "type": "loss", "content": 0.0001282426674151793, "timestamp": "2025-09-30 22:12:34.082109", "step": 2641, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:34.130320", "step": 2641, "epoch": 3 }, { "type": "loss", "content": 0.0003186077228747308, "timestamp": "2025-09-30 22:12:34.133394", "step": 2642, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:34.173089", "step": 2642, "epoch": 3 }, { "type": "loss", "content": 0.0013501204084604979, "timestamp": "2025-09-30 22:12:34.181624", "step": 2643, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:34.217404", "step": 2643, "epoch": 3 }, { "type": "loss", "content": 0.0008143013110384345, "timestamp": "2025-09-30 22:12:34.242369", "step": 2644, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:34.287009", "step": 2644, "epoch": 3 }, { "type": "loss", "content": 0.0003296361246611923, "timestamp": "2025-09-30 22:12:34.289451", "step": 2645, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:34.337306", "step": 2645, "epoch": 3 }, { "type": "loss", "content": 0.0003478021826595068, "timestamp": "2025-09-30 22:12:34.339980", "step": 2646, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:34.373991", "step": 2646, "epoch": 3 }, { "type": "loss", "content": 0.02839597873389721, "timestamp": "2025-09-30 22:12:34.376996", "step": 2647, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:34.415319", "step": 2647, "epoch": 3 }, { "type": "loss", "content": 0.0025328085757791996, "timestamp": "2025-09-30 22:12:34.439917", "step": 2648, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:34.487858", "step": 2648, "epoch": 3 }, { "type": "loss", "content": 0.0014561775606125593, "timestamp": "2025-09-30 22:12:34.500447", "step": 2649, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:34.539997", "step": 2649, "epoch": 3 }, { "type": "loss", "content": 0.00031854381086304784, "timestamp": "2025-09-30 22:12:34.545509", "step": 2650, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:34.595679", "step": 2650, "epoch": 3 }, { "type": "loss", "content": 0.00034756778040900826, "timestamp": "2025-09-30 22:12:34.604222", "step": 2651, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:34.641346", "step": 2651, "epoch": 3 }, { "type": "loss", "content": 0.00027000525733456016, "timestamp": "2025-09-30 22:12:34.667001", "step": 2652, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:34.707961", "step": 2652, "epoch": 3 }, { "type": "loss", "content": 0.00014926944277249277, "timestamp": "2025-09-30 22:12:34.714366", "step": 2653, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:34.752646", "step": 2653, "epoch": 3 }, { "type": "loss", "content": 0.00026244850596413016, "timestamp": "2025-09-30 22:12:34.754881", "step": 2654, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:34.793236", "step": 2654, "epoch": 3 }, { "type": "loss", "content": 0.001408580457791686, "timestamp": "2025-09-30 22:12:34.795807", "step": 2655, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:34.827955", "step": 2655, "epoch": 3 }, { "type": "loss", "content": 0.0016296766698360443, "timestamp": "2025-09-30 22:12:34.852459", "step": 2656, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:34.887341", "step": 2656, "epoch": 3 }, { "type": "loss", "content": 0.00039507076144218445, "timestamp": "2025-09-30 22:12:34.890633", "step": 2657, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:34.929812", "step": 2657, "epoch": 3 }, { "type": "loss", "content": 0.0014382427325472236, "timestamp": "2025-09-30 22:12:34.932184", "step": 2658, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:34.971565", "step": 2658, "epoch": 3 }, { "type": "loss", "content": 0.000742629577871412, "timestamp": "2025-09-30 22:12:34.974106", "step": 2659, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:35.010196", "step": 2659, "epoch": 3 }, { "type": "loss", "content": 0.0011206923518329859, "timestamp": "2025-09-30 22:12:35.041088", "step": 2660, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:35.076357", "step": 2660, "epoch": 3 }, { "type": "loss", "content": 0.0002413646870991215, "timestamp": "2025-09-30 22:12:35.079435", "step": 2661, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:35.112612", "step": 2661, "epoch": 3 }, { "type": "loss", "content": 0.0026289846282452345, "timestamp": "2025-09-30 22:12:35.116867", "step": 2662, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:35.158701", "step": 2662, "epoch": 3 }, { "type": "loss", "content": 0.004568056203424931, "timestamp": "2025-09-30 22:12:35.162892", "step": 2663, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:35.196591", "step": 2663, "epoch": 3 }, { "type": "loss", "content": 0.011687947437167168, "timestamp": "2025-09-30 22:12:35.221743", "step": 2664, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:35.269486", "step": 2664, "epoch": 3 }, { "type": "loss", "content": 0.013851391151547432, "timestamp": "2025-09-30 22:12:35.281776", "step": 2665, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:35.317418", "step": 2665, "epoch": 3 }, { "type": "loss", "content": 0.000755517918150872, "timestamp": "2025-09-30 22:12:35.324859", "step": 2666, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:35.367629", "step": 2666, "epoch": 3 }, { "type": "loss", "content": 0.0006584663642570376, "timestamp": "2025-09-30 22:12:35.370139", "step": 2667, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:35.406241", "step": 2667, "epoch": 3 }, { "type": "loss", "content": 0.004880126100033522, "timestamp": "2025-09-30 22:12:35.434564", "step": 2668, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:35.479084", "step": 2668, "epoch": 3 }, { "type": "loss", "content": 0.011352315545082092, "timestamp": "2025-09-30 22:12:35.481569", "step": 2669, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:35.514422", "step": 2669, "epoch": 3 }, { "type": "loss", "content": 0.00172287633176893, "timestamp": "2025-09-30 22:12:35.525366", "step": 2670, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:35.558477", "step": 2670, "epoch": 3 }, { "type": "loss", "content": 0.0005279943579807878, "timestamp": "2025-09-30 22:12:35.561361", "step": 2671, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:35.594845", "step": 2671, "epoch": 3 }, { "type": "loss", "content": 0.0005277348100207746, "timestamp": "2025-09-30 22:12:35.619232", "step": 2672, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:35.652163", "step": 2672, "epoch": 3 }, { "type": "loss", "content": 0.0016348527278751135, "timestamp": "2025-09-30 22:12:35.656803", "step": 2673, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:35.688510", "step": 2673, "epoch": 3 }, { "type": "loss", "content": 0.0004556818457785994, "timestamp": "2025-09-30 22:12:35.691923", "step": 2674, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:35.725413", "step": 2674, "epoch": 3 }, { "type": "loss", "content": 0.00033512728987261653, "timestamp": "2025-09-30 22:12:35.731295", "step": 2675, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:35.764672", "step": 2675, "epoch": 3 }, { "type": "loss", "content": 0.0003525232896208763, "timestamp": "2025-09-30 22:12:35.789130", "step": 2676, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:35.840443", "step": 2676, "epoch": 3 }, { "type": "loss", "content": 0.0004273319209460169, "timestamp": "2025-09-30 22:12:35.846633", "step": 2677, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:35.883671", "step": 2677, "epoch": 3 }, { "type": "loss", "content": 0.00047996934154070914, "timestamp": "2025-09-30 22:12:35.886690", "step": 2678, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:35.921593", "step": 2678, "epoch": 3 }, { "type": "loss", "content": 0.00013553262397181243, "timestamp": "2025-09-30 22:12:35.925247", "step": 2679, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:12:36.913562", "step": 2679, "epoch": 3 }, { "type": "pplx", "content": 42617958.04798346, "timestamp": "2025-09-30 22:12:36.917864", "step": 2679, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:36.949128", "step": 2679, "epoch": 3 }, { "type": "loss", "content": 0.00010548696445766836, "timestamp": "2025-09-30 22:12:36.973270", "step": 2680, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:37.007122", "step": 2680, "epoch": 3 }, { "type": "loss", "content": 0.0002033339551417157, "timestamp": "2025-09-30 22:12:37.011913", "step": 2681, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:37.051670", "step": 2681, "epoch": 3 }, { "type": "loss", "content": 0.0023126639425754547, "timestamp": "2025-09-30 22:12:37.054087", "step": 2682, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:37.088537", "step": 2682, "epoch": 3 }, { "type": "loss", "content": 0.00015263084787875414, "timestamp": "2025-09-30 22:12:37.091494", "step": 2683, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:37.123744", "step": 2683, "epoch": 3 }, { "type": "loss", "content": 0.00012648347183130682, "timestamp": "2025-09-30 22:12:37.148056", "step": 2684, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:37.190144", "step": 2684, "epoch": 3 }, { "type": "loss", "content": 0.00020340543414931744, "timestamp": "2025-09-30 22:12:37.201276", "step": 2685, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:37.235095", "step": 2685, "epoch": 3 }, { "type": "loss", "content": 0.00026868985150940716, "timestamp": "2025-09-30 22:12:37.237911", "step": 2686, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:37.270419", "step": 2686, "epoch": 3 }, { "type": "loss", "content": 0.0008745346567593515, "timestamp": "2025-09-30 22:12:37.276740", "step": 2687, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:37.312828", "step": 2687, "epoch": 3 }, { "type": "loss", "content": 0.0003780484548769891, "timestamp": "2025-09-30 22:12:37.337304", "step": 2688, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:37.369820", "step": 2688, "epoch": 3 }, { "type": "loss", "content": 0.00034286329173482955, "timestamp": "2025-09-30 22:12:37.380102", "step": 2689, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:37.422461", "step": 2689, "epoch": 3 }, { "type": "loss", "content": 0.000469776161480695, "timestamp": "2025-09-30 22:12:37.427254", "step": 2690, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:37.474975", "step": 2690, "epoch": 3 }, { "type": "loss", "content": 0.0003858681011479348, "timestamp": "2025-09-30 22:12:37.478664", "step": 2691, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:37.512143", "step": 2691, "epoch": 3 }, { "type": "loss", "content": 0.00017460745584685355, "timestamp": "2025-09-30 22:12:37.536265", "step": 2692, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:37.568058", "step": 2692, "epoch": 3 }, { "type": "loss", "content": 0.00015085657651070505, "timestamp": "2025-09-30 22:12:37.572465", "step": 2693, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:37.605977", "step": 2693, "epoch": 3 }, { "type": "loss", "content": 0.00038411474088206887, "timestamp": "2025-09-30 22:12:37.609455", "step": 2694, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:37.652678", "step": 2694, "epoch": 3 }, { "type": "loss", "content": 0.00023197698465082794, "timestamp": "2025-09-30 22:12:37.662867", "step": 2695, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:37.704594", "step": 2695, "epoch": 3 }, { "type": "loss", "content": 0.00028276038938201964, "timestamp": "2025-09-30 22:12:37.729499", "step": 2696, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:37.762768", "step": 2696, "epoch": 3 }, { "type": "loss", "content": 0.00018464835011400282, "timestamp": "2025-09-30 22:12:37.765646", "step": 2697, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:37.803089", "step": 2697, "epoch": 3 }, { "type": "loss", "content": 0.0001767083740560338, "timestamp": "2025-09-30 22:12:37.806072", "step": 2698, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:37.858964", "step": 2698, "epoch": 3 }, { "type": "loss", "content": 0.003367830766364932, "timestamp": "2025-09-30 22:12:37.863758", "step": 2699, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:37.897678", "step": 2699, "epoch": 3 }, { "type": "loss", "content": 0.0005176683189347386, "timestamp": "2025-09-30 22:12:37.922003", "step": 2700, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:37.956779", "step": 2700, "epoch": 3 }, { "type": "loss", "content": 0.00020833851885981858, "timestamp": "2025-09-30 22:12:37.959761", "step": 2701, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:37.993816", "step": 2701, "epoch": 3 }, { "type": "loss", "content": 0.00023042989778332412, "timestamp": "2025-09-30 22:12:37.996791", "step": 2702, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:38.029406", "step": 2702, "epoch": 3 }, { "type": "loss", "content": 0.00023371285351458937, "timestamp": "2025-09-30 22:12:38.032643", "step": 2703, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:38.065029", "step": 2703, "epoch": 3 }, { "type": "loss", "content": 0.00014216203999239951, "timestamp": "2025-09-30 22:12:38.091723", "step": 2704, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:38.124290", "step": 2704, "epoch": 3 }, { "type": "loss", "content": 0.00016069450066424906, "timestamp": "2025-09-30 22:12:38.127909", "step": 2705, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:38.167316", "step": 2705, "epoch": 3 }, { "type": "loss", "content": 0.00022106101096142083, "timestamp": "2025-09-30 22:12:38.170411", "step": 2706, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:38.205407", "step": 2706, "epoch": 3 }, { "type": "loss", "content": 0.00013788194337394089, "timestamp": "2025-09-30 22:12:38.208092", "step": 2707, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:38.241971", "step": 2707, "epoch": 3 }, { "type": "loss", "content": 0.0027592540718615055, "timestamp": "2025-09-30 22:12:38.269289", "step": 2708, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:38.306088", "step": 2708, "epoch": 3 }, { "type": "loss", "content": 0.00023793955915607512, "timestamp": "2025-09-30 22:12:38.308756", "step": 2709, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:38.344310", "step": 2709, "epoch": 3 }, { "type": "loss", "content": 0.00043959185131825507, "timestamp": "2025-09-30 22:12:38.347239", "step": 2710, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:38.385744", "step": 2710, "epoch": 3 }, { "type": "loss", "content": 0.00016678121755830944, "timestamp": "2025-09-30 22:12:38.388621", "step": 2711, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:38.420558", "step": 2711, "epoch": 3 }, { "type": "loss", "content": 0.0001507179404143244, "timestamp": "2025-09-30 22:12:38.445568", "step": 2712, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:38.480184", "step": 2712, "epoch": 3 }, { "type": "loss", "content": 0.00047512754099443555, "timestamp": "2025-09-30 22:12:38.483799", "step": 2713, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:38.530889", "step": 2713, "epoch": 3 }, { "type": "loss", "content": 0.05275354161858559, "timestamp": "2025-09-30 22:12:38.541284", "step": 2714, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:38.576767", "step": 2714, "epoch": 3 }, { "type": "loss", "content": 0.03382421284914017, "timestamp": "2025-09-30 22:12:38.580081", "step": 2715, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:38.618144", "step": 2715, "epoch": 3 }, { "type": "loss", "content": 0.00014904925774317235, "timestamp": "2025-09-30 22:12:38.642280", "step": 2716, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:38.682395", "step": 2716, "epoch": 3 }, { "type": "loss", "content": 0.002304724184796214, "timestamp": "2025-09-30 22:12:38.685149", "step": 2717, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:38.724172", "step": 2717, "epoch": 3 }, { "type": "loss", "content": 7.812363764969632e-05, "timestamp": "2025-09-30 22:12:38.726961", "step": 2718, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:38.759301", "step": 2718, "epoch": 3 }, { "type": "loss", "content": 0.010375364683568478, "timestamp": "2025-09-30 22:12:38.770156", "step": 2719, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:38.802579", "step": 2719, "epoch": 3 }, { "type": "loss", "content": 0.0035843809600919485, "timestamp": "2025-09-30 22:12:38.827050", "step": 2720, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:38.860385", "step": 2720, "epoch": 3 }, { "type": "loss", "content": 0.00012178834003861994, "timestamp": "2025-09-30 22:12:38.863867", "step": 2721, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:38.901226", "step": 2721, "epoch": 3 }, { "type": "loss", "content": 0.00018442458531353623, "timestamp": "2025-09-30 22:12:38.904128", "step": 2722, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:38.938270", "step": 2722, "epoch": 3 }, { "type": "loss", "content": 0.0412592850625515, "timestamp": "2025-09-30 22:12:38.940794", "step": 2723, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:38.972591", "step": 2723, "epoch": 3 }, { "type": "loss", "content": 0.0004322502645663917, "timestamp": "2025-09-30 22:12:38.996875", "step": 2724, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:39.039280", "step": 2724, "epoch": 3 }, { "type": "loss", "content": 0.00895337201654911, "timestamp": "2025-09-30 22:12:39.041770", "step": 2725, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:39.073608", "step": 2725, "epoch": 3 }, { "type": "loss", "content": 0.0011367687257006764, "timestamp": "2025-09-30 22:12:39.081069", "step": 2726, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:39.116189", "step": 2726, "epoch": 3 }, { "type": "loss", "content": 0.029959097504615784, "timestamp": "2025-09-30 22:12:39.121750", "step": 2727, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:39.158945", "step": 2727, "epoch": 3 }, { "type": "loss", "content": 0.0005665087956003845, "timestamp": "2025-09-30 22:12:39.182699", "step": 2728, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:39.217942", "step": 2728, "epoch": 3 }, { "type": "loss", "content": 0.021225055679678917, "timestamp": "2025-09-30 22:12:39.229996", "step": 2729, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:39.273964", "step": 2729, "epoch": 3 }, { "type": "loss", "content": 0.004430113825947046, "timestamp": "2025-09-30 22:12:39.277006", "step": 2730, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:39.314672", "step": 2730, "epoch": 3 }, { "type": "loss", "content": 0.0031106334645301104, "timestamp": "2025-09-30 22:12:39.317959", "step": 2731, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:39.351094", "step": 2731, "epoch": 3 }, { "type": "loss", "content": 0.0057603525929152966, "timestamp": "2025-09-30 22:12:39.375669", "step": 2732, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:39.410439", "step": 2732, "epoch": 3 }, { "type": "loss", "content": 0.005964639596641064, "timestamp": "2025-09-30 22:12:39.413100", "step": 2733, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:39.449550", "step": 2733, "epoch": 3 }, { "type": "loss", "content": 0.0006206769612617791, "timestamp": "2025-09-30 22:12:39.452539", "step": 2734, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:39.489464", "step": 2734, "epoch": 3 }, { "type": "loss", "content": 0.0002034508652286604, "timestamp": "2025-09-30 22:12:39.492379", "step": 2735, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:39.531365", "step": 2735, "epoch": 3 }, { "type": "loss", "content": 0.0005070780753158033, "timestamp": "2025-09-30 22:12:39.557330", "step": 2736, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:12:40.496739", "step": 2736, "epoch": 3 }, { "type": "pplx", "content": 38833778.565379806, "timestamp": "2025-09-30 22:12:40.501353", "step": 2736, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:40.537420", "step": 2736, "epoch": 3 }, { "type": "loss", "content": 0.00042862427653744817, "timestamp": "2025-09-30 22:12:40.543890", "step": 2737, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:40.587420", "step": 2737, "epoch": 3 }, { "type": "loss", "content": 0.0012384362053126097, "timestamp": "2025-09-30 22:12:40.595845", "step": 2738, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:40.644008", "step": 2738, "epoch": 3 }, { "type": "loss", "content": 0.0006894408725202084, "timestamp": "2025-09-30 22:12:40.651361", "step": 2739, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:40.686317", "step": 2739, "epoch": 3 }, { "type": "loss", "content": 0.0016596141504123807, "timestamp": "2025-09-30 22:12:40.717699", "step": 2740, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:40.752169", "step": 2740, "epoch": 3 }, { "type": "loss", "content": 0.0027973626274615526, "timestamp": "2025-09-30 22:12:40.756868", "step": 2741, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:40.793972", "step": 2741, "epoch": 3 }, { "type": "loss", "content": 0.0004706961044576019, "timestamp": "2025-09-30 22:12:40.802139", "step": 2742, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:40.838051", "step": 2742, "epoch": 3 }, { "type": "loss", "content": 0.014765026047825813, "timestamp": "2025-09-30 22:12:40.840754", "step": 2743, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:40.876213", "step": 2743, "epoch": 3 }, { "type": "loss", "content": 0.0023852067533880472, "timestamp": "2025-09-30 22:12:40.905606", "step": 2744, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:40.957718", "step": 2744, "epoch": 3 }, { "type": "loss", "content": 0.001915394444949925, "timestamp": "2025-09-30 22:12:40.963871", "step": 2745, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:41.000241", "step": 2745, "epoch": 3 }, { "type": "loss", "content": 0.048781026154756546, "timestamp": "2025-09-30 22:12:41.005196", "step": 2746, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:41.036291", "step": 2746, "epoch": 3 }, { "type": "loss", "content": 0.00042291748104617, "timestamp": "2025-09-30 22:12:41.039170", "step": 2747, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-09-30 22:12:41.070797", "step": 2747, "epoch": 3 }, { "type": "loss", "content": 0.0029773118440061808, "timestamp": "2025-09-30 22:12:41.094545", "step": 2748, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:41.135632", "step": 2748, "epoch": 3 }, { "type": "loss", "content": 0.010216223075985909, "timestamp": "2025-09-30 22:12:41.142428", "step": 2749, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-09-30 22:12:41.173671", "step": 2749, "epoch": 3 }, { "type": "loss", "content": 0.06497301906347275, "timestamp": "2025-09-30 22:12:41.176382", "step": 2750, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-09-30 22:12:41.212088", "step": 2750, "epoch": 3 }, { "type": "loss", "content": 0.0005495469667948782, "timestamp": "2025-09-30 22:12:41.215170", "step": 2751, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-09-30 22:12:42.223207", "step": 2751, "epoch": 3 }, { "type": "pplx", "content": 36628973.8921973, "timestamp": "2025-09-30 22:12:42.226115", "step": 2751, "epoch": 3 }, { "type": "best_pplx", "content": 36628973.8921973, "timestamp": "2025-09-30 22:12:42.228676", "step": 2751, "epoch": 3 }, { "type": "best_step", "content": 2751, "timestamp": "2025-09-30 22:12:42.230586", "step": 2751, "epoch": 3 }, { "type": "total_pplx_flops", "content": 5014951860256000, "timestamp": "2025-09-30 22:12:42.232343", "step": 2751, "epoch": 3 }, { "type": "total_train_flops", "content": 10640863719936576, "timestamp": "2025-09-30 22:12:42.237411", "step": 2751, "epoch": 3 } ], "best_evals": { "pplx": { "score": 36628973.8921973, "step": 2751 }, "rougel": { "precision": 0.7598039215686274, "recall": 0.7598039215686274, "fmeasure": 0.7598039215686274 } } }