{ "training_args": { "output_dir": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters_3/nlu_mrpc_ff_v1", "overwrite_output_dir": false, "do_train": false, "do_eval": true, "do_predict": false, "eval_strategy": "steps", "prediction_loss_only": false, "per_device_train_batch_size": 4, "per_device_eval_batch_size": 8, "per_gpu_train_batch_size": null, "per_gpu_eval_batch_size": null, "gradient_accumulation_steps": 4, "eval_accumulation_steps": null, "eval_delay": 0, "torch_empty_cache_steps": null, "learning_rate": 2e-05, "weight_decay": 0.0, "adam_beta1": 0.9, "adam_beta2": 0.999, "adam_epsilon": 1e-08, "max_grad_norm": 1.0, "num_train_epochs": 3, "max_steps": -1, "lr_scheduler_type": "linear", "lr_scheduler_kwargs": {}, "warmup_ratio": 0.0, "warmup_steps": 0, "log_level": "passive", "log_level_replica": "warning", "log_on_each_node": true, "logging_dir": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters_3/nlu_mrpc_ff_v1/runs/Oct01_04-10-45_gx07", "logging_strategy": "steps", "logging_first_step": false, "logging_steps": 20, "logging_nan_inf_filter": true, "save_strategy": "epoch", "save_steps": 500, "save_total_limit": null, "save_safetensors": true, "save_on_each_node": false, "save_only_model": false, "restore_callback_states_from_checkpoint": false, "no_cuda": false, "use_cpu": false, "use_mps_device": false, "seed": 42, "data_seed": null, "jit_mode_eval": false, "use_ipex": false, "bf16": false, "fp16": false, "fp16_opt_level": "O1", "half_precision_backend": "auto", "bf16_full_eval": false, "fp16_full_eval": false, "tf32": null, "local_rank": 0, "ddp_backend": null, "tpu_num_cores": null, "tpu_metrics_debug": false, "debug": [], "dataloader_drop_last": false, "eval_steps": 57, "dataloader_num_workers": 0, "dataloader_prefetch_factor": null, "past_index": -1, "run_name": "/sc/projects/sci-herbrich/chair/lora-bp/valentin.teutschbein/adapters_3/nlu_mrpc_ff_v1", "disable_tqdm": false, "remove_unused_columns": true, "label_names": null, "load_best_model_at_end": false, "metric_for_best_model": null, "greater_is_better": null, "ignore_data_skip": false, "fsdp": [], "fsdp_min_num_params": 0, "fsdp_config": { "min_num_params": 0, "xla": false, "xla_fsdp_v2": false, "xla_fsdp_grad_ckpt": false }, "fsdp_transformer_layer_cls_to_wrap": null, "accelerator_config": { "split_batches": false, "dispatch_batches": null, "even_batches": true, "use_seedable_sampler": true, "non_blocking": false, "gradient_accumulation_kwargs": null }, "deepspeed": null, "label_smoothing_factor": 0.0, "optim": "adamw_torch", "optim_args": null, "adafactor": false, "group_by_length": false, "length_column_name": "length", "report_to": [], "ddp_find_unused_parameters": null, "ddp_bucket_cap_mb": null, "ddp_broadcast_buffers": null, "dataloader_pin_memory": true, "dataloader_persistent_workers": false, "skip_memory_metrics": true, "use_legacy_prediction_loop": false, "push_to_hub": false, "resume_from_checkpoint": null, "hub_model_id": null, "hub_strategy": "every_save", "hub_token": "", "hub_private_repo": null, "hub_always_push": false, "gradient_checkpointing": false, "gradient_checkpointing_kwargs": null, "include_inputs_for_metrics": false, "include_for_metrics": [], "eval_do_concat_batches": true, "fp16_backend": "auto", "push_to_hub_model_id": null, "push_to_hub_organization": null, "push_to_hub_token": "", "mp_parameters": "", "auto_find_batch_size": false, "full_determinism": false, "torchdynamo": null, "ray_scope": "last", "ddp_timeout": 1800, "torch_compile": false, "torch_compile_backend": null, "torch_compile_mode": null, "include_tokens_per_second": false, "include_num_input_tokens_seen": false, "neftune_noise_alpha": null, "optim_target_modules": null, "batch_eval_metrics": false, "eval_on_start": false, "use_liger_kernel": false, "eval_use_gather_object": false, "average_tokens_across_devices": false }, "lora_config": null, "flops": { "eval": 5014951860256000, "train": 10640863719936576, "total": 15655815580192576 }, "total": { "total": 59569.155660000004, "train": 45950.96976000001, "eval": 13618.1859 }, "logs": [ { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:10:53.095686", "step": 0, "epoch": 0 }, { "type": "pplx", "content": 226674977.87649825, "timestamp": "2025-10-01 04:10:53.105798", "step": 0, "epoch": 0 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:10:53.194138", "step": 0, "epoch": 1 }, { "type": "loss", "content": 0.703412652015686, "timestamp": "2025-10-01 04:10:53.200479", "step": 1, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:53.270265", "step": 1, "epoch": 1 }, { "type": "loss", "content": 0.699732780456543, "timestamp": "2025-10-01 04:10:53.285188", "step": 2, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:53.347661", "step": 2, "epoch": 1 }, { "type": "loss", "content": 0.7366790175437927, "timestamp": "2025-10-01 04:10:53.351307", "step": 3, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:53.391890", "step": 3, "epoch": 1 }, { "type": "loss", "content": 0.7117161154747009, "timestamp": "2025-10-01 04:10:53.461222", "step": 4, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:53.507281", "step": 4, "epoch": 1 }, { "type": "loss", "content": 0.12175176292657852, "timestamp": "2025-10-01 04:10:53.511748", "step": 5, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:53.569934", "step": 5, "epoch": 1 }, { "type": "loss", "content": 0.11768395453691483, "timestamp": "2025-10-01 04:10:53.575739", "step": 6, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:10:53.619858", "step": 6, "epoch": 1 }, { "type": "loss", "content": 0.12094881385564804, "timestamp": "2025-10-01 04:10:53.629033", "step": 7, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:53.676006", "step": 7, "epoch": 1 }, { "type": "loss", "content": 0.13165441155433655, "timestamp": "2025-10-01 04:10:53.704869", "step": 8, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:53.746265", "step": 8, "epoch": 1 }, { "type": "loss", "content": 0.010522748343646526, "timestamp": "2025-10-01 04:10:53.752986", "step": 9, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:53.795388", "step": 9, "epoch": 1 }, { "type": "loss", "content": 0.042851757258176804, "timestamp": "2025-10-01 04:10:53.800877", "step": 10, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:53.843002", "step": 10, "epoch": 1 }, { "type": "loss", "content": 0.026877496391534805, "timestamp": "2025-10-01 04:10:53.851789", "step": 11, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:53.898927", "step": 11, "epoch": 1 }, { "type": "loss", "content": 0.01061374880373478, "timestamp": "2025-10-01 04:10:53.926835", "step": 12, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:53.974466", "step": 12, "epoch": 1 }, { "type": "loss", "content": 0.025151683017611504, "timestamp": "2025-10-01 04:10:53.981624", "step": 13, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:54.027921", "step": 13, "epoch": 1 }, { "type": "loss", "content": 0.06414594501256943, "timestamp": "2025-10-01 04:10:54.034779", "step": 14, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:54.083254", "step": 14, "epoch": 1 }, { "type": "loss", "content": 0.042276572436094284, "timestamp": "2025-10-01 04:10:54.092531", "step": 15, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:54.142288", "step": 15, "epoch": 1 }, { "type": "loss", "content": 0.024483708664774895, "timestamp": "2025-10-01 04:10:54.173196", "step": 16, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:54.226170", "step": 16, "epoch": 1 }, { "type": "loss", "content": 0.02734944596886635, "timestamp": "2025-10-01 04:10:54.229034", "step": 17, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:54.279877", "step": 17, "epoch": 1 }, { "type": "loss", "content": 0.02907527983188629, "timestamp": "2025-10-01 04:10:54.285924", "step": 18, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:10:54.332981", "step": 18, "epoch": 1 }, { "type": "loss", "content": 0.02434028685092926, "timestamp": "2025-10-01 04:10:54.341022", "step": 19, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:54.389533", "step": 19, "epoch": 1 }, { "type": "loss", "content": 0.026969805359840393, "timestamp": "2025-10-01 04:10:54.418017", "step": 20, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:10:54.466528", "step": 20, "epoch": 1 }, { "type": "loss", "content": 0.031657036393880844, "timestamp": "2025-10-01 04:10:54.470219", "step": 21, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:54.509071", "step": 21, "epoch": 1 }, { "type": "loss", "content": 0.03591819852590561, "timestamp": "2025-10-01 04:10:54.516264", "step": 22, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:54.567711", "step": 22, "epoch": 1 }, { "type": "loss", "content": 0.025713395327329636, "timestamp": "2025-10-01 04:10:54.574780", "step": 23, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:54.623030", "step": 23, "epoch": 1 }, { "type": "loss", "content": 0.024056894704699516, "timestamp": "2025-10-01 04:10:54.653931", "step": 24, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:54.699467", "step": 24, "epoch": 1 }, { "type": "loss", "content": 0.022549999877810478, "timestamp": "2025-10-01 04:10:54.705025", "step": 25, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:10:54.761120", "step": 25, "epoch": 1 }, { "type": "loss", "content": 0.028963249176740646, "timestamp": "2025-10-01 04:10:54.769632", "step": 26, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:10:54.813599", "step": 26, "epoch": 1 }, { "type": "loss", "content": 0.020357603207230568, "timestamp": "2025-10-01 04:10:54.822712", "step": 27, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:54.866029", "step": 27, "epoch": 1 }, { "type": "loss", "content": 0.022364825010299683, "timestamp": "2025-10-01 04:10:54.895471", "step": 28, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:54.946141", "step": 28, "epoch": 1 }, { "type": "loss", "content": 0.03066479228436947, "timestamp": "2025-10-01 04:10:54.956692", "step": 29, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:55.014996", "step": 29, "epoch": 1 }, { "type": "loss", "content": 0.021274466067552567, "timestamp": "2025-10-01 04:10:55.026123", "step": 30, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:55.087929", "step": 30, "epoch": 1 }, { "type": "loss", "content": 0.02097862772643566, "timestamp": "2025-10-01 04:10:55.100150", "step": 31, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:55.157840", "step": 31, "epoch": 1 }, { "type": "loss", "content": 0.0193604938685894, "timestamp": "2025-10-01 04:10:55.189931", "step": 32, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:55.238573", "step": 32, "epoch": 1 }, { "type": "loss", "content": 0.011159072630107403, "timestamp": "2025-10-01 04:10:55.247628", "step": 33, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:55.298064", "step": 33, "epoch": 1 }, { "type": "loss", "content": 0.029572388157248497, "timestamp": "2025-10-01 04:10:55.306525", "step": 34, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:55.350752", "step": 34, "epoch": 1 }, { "type": "loss", "content": 0.011771670542657375, "timestamp": "2025-10-01 04:10:55.361310", "step": 35, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:55.411118", "step": 35, "epoch": 1 }, { "type": "loss", "content": 0.018943196162581444, "timestamp": "2025-10-01 04:10:55.441215", "step": 36, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:55.488907", "step": 36, "epoch": 1 }, { "type": "loss", "content": 0.014170250855386257, "timestamp": "2025-10-01 04:10:55.497674", "step": 37, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:55.547353", "step": 37, "epoch": 1 }, { "type": "loss", "content": 0.0197011586278677, "timestamp": "2025-10-01 04:10:55.555220", "step": 38, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:10:55.598036", "step": 38, "epoch": 1 }, { "type": "loss", "content": 0.010018707253038883, "timestamp": "2025-10-01 04:10:55.602924", "step": 39, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:55.647184", "step": 39, "epoch": 1 }, { "type": "loss", "content": 0.019700050354003906, "timestamp": "2025-10-01 04:10:55.674501", "step": 40, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:55.720735", "step": 40, "epoch": 1 }, { "type": "loss", "content": 0.008056686259806156, "timestamp": "2025-10-01 04:10:55.727517", "step": 41, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:10:55.773450", "step": 41, "epoch": 1 }, { "type": "loss", "content": 0.01834237389266491, "timestamp": "2025-10-01 04:10:55.783825", "step": 42, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:55.826968", "step": 42, "epoch": 1 }, { "type": "loss", "content": 0.03287697210907936, "timestamp": "2025-10-01 04:10:55.829460", "step": 43, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:10:55.890294", "step": 43, "epoch": 1 }, { "type": "loss", "content": 0.007924867793917656, "timestamp": "2025-10-01 04:10:55.914007", "step": 44, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:55.944304", "step": 44, "epoch": 1 }, { "type": "loss", "content": 0.006307397969067097, "timestamp": "2025-10-01 04:10:55.946373", "step": 45, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:55.976313", "step": 45, "epoch": 1 }, { "type": "loss", "content": 0.005574687384068966, "timestamp": "2025-10-01 04:10:55.978655", "step": 46, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:56.008745", "step": 46, "epoch": 1 }, { "type": "loss", "content": 0.037656109780073166, "timestamp": "2025-10-01 04:10:56.011071", "step": 47, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:56.041579", "step": 47, "epoch": 1 }, { "type": "loss", "content": 0.021889669820666313, "timestamp": "2025-10-01 04:10:56.065327", "step": 48, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:56.096017", "step": 48, "epoch": 1 }, { "type": "loss", "content": 0.004702998790889978, "timestamp": "2025-10-01 04:10:56.098076", "step": 49, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:10:56.131661", "step": 49, "epoch": 1 }, { "type": "loss", "content": 0.03982829302549362, "timestamp": "2025-10-01 04:10:56.134107", "step": 50, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:56.164382", "step": 50, "epoch": 1 }, { "type": "loss", "content": 0.005432396661490202, "timestamp": "2025-10-01 04:10:56.166655", "step": 51, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:56.197372", "step": 51, "epoch": 1 }, { "type": "loss", "content": 0.03515584021806717, "timestamp": "2025-10-01 04:10:56.221336", "step": 52, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:56.252932", "step": 52, "epoch": 1 }, { "type": "loss", "content": 0.036965373903512955, "timestamp": "2025-10-01 04:10:56.256656", "step": 53, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:10:56.287467", "step": 53, "epoch": 1 }, { "type": "loss", "content": 0.03869860619306564, "timestamp": "2025-10-01 04:10:56.289770", "step": 54, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:10:56.319993", "step": 54, "epoch": 1 }, { "type": "loss", "content": 0.019588610157370567, "timestamp": "2025-10-01 04:10:56.322701", "step": 55, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:10:56.353408", "step": 55, "epoch": 1 }, { "type": "loss", "content": 0.039842940866947174, "timestamp": "2025-10-01 04:10:56.377393", "step": 56, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:56.408377", "step": 56, "epoch": 1 }, { "type": "loss", "content": 0.020486917346715927, "timestamp": "2025-10-01 04:10:56.410470", "step": 57, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:10:57.137405", "step": 57, "epoch": 1 }, { "type": "pplx", "content": 104877281.17432675, "timestamp": "2025-10-01 04:10:57.139218", "step": 57, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:57.169214", "step": 57, "epoch": 1 }, { "type": "loss", "content": 0.01975156180560589, "timestamp": "2025-10-01 04:10:57.171568", "step": 58, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:57.206987", "step": 58, "epoch": 1 }, { "type": "loss", "content": 0.018474208191037178, "timestamp": "2025-10-01 04:10:57.209120", "step": 59, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:57.240574", "step": 59, "epoch": 1 }, { "type": "loss", "content": 0.020764997228980064, "timestamp": "2025-10-01 04:10:57.264650", "step": 60, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:57.298954", "step": 60, "epoch": 1 }, { "type": "loss", "content": 0.019322749227285385, "timestamp": "2025-10-01 04:10:57.301059", "step": 61, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:10:57.331441", "step": 61, "epoch": 1 }, { "type": "loss", "content": 0.020046358928084373, "timestamp": "2025-10-01 04:10:57.333636", "step": 62, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:10:57.363712", "step": 62, "epoch": 1 }, { "type": "loss", "content": 0.01184836309403181, "timestamp": "2025-10-01 04:10:57.366942", "step": 63, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:57.398273", "step": 63, "epoch": 1 }, { "type": "loss", "content": 0.010952807031571865, "timestamp": "2025-10-01 04:10:57.422021", "step": 64, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:57.453477", "step": 64, "epoch": 1 }, { "type": "loss", "content": 0.013404452241957188, "timestamp": "2025-10-01 04:10:57.455327", "step": 65, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:10:57.486321", "step": 65, "epoch": 1 }, { "type": "loss", "content": 0.02560088224709034, "timestamp": "2025-10-01 04:10:57.488536", "step": 66, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:57.520393", "step": 66, "epoch": 1 }, { "type": "loss", "content": 0.01485830545425415, "timestamp": "2025-10-01 04:10:57.522674", "step": 67, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:57.555043", "step": 67, "epoch": 1 }, { "type": "loss", "content": 0.024088138714432716, "timestamp": "2025-10-01 04:10:57.579039", "step": 68, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:10:57.615446", "step": 68, "epoch": 1 }, { "type": "loss", "content": 0.026860293000936508, "timestamp": "2025-10-01 04:10:57.617565", "step": 69, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:57.648283", "step": 69, "epoch": 1 }, { "type": "loss", "content": 0.019698916003108025, "timestamp": "2025-10-01 04:10:57.650365", "step": 70, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:57.681342", "step": 70, "epoch": 1 }, { "type": "loss", "content": 0.016542484983801842, "timestamp": "2025-10-01 04:10:57.683869", "step": 71, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:57.715002", "step": 71, "epoch": 1 }, { "type": "loss", "content": 0.026610536500811577, "timestamp": "2025-10-01 04:10:57.738746", "step": 72, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:10:57.769910", "step": 72, "epoch": 1 }, { "type": "loss", "content": 0.02022835798561573, "timestamp": "2025-10-01 04:10:57.772056", "step": 73, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:57.804012", "step": 73, "epoch": 1 }, { "type": "loss", "content": 0.02639939822256565, "timestamp": "2025-10-01 04:10:57.806345", "step": 74, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:57.837218", "step": 74, "epoch": 1 }, { "type": "loss", "content": 0.020459027960896492, "timestamp": "2025-10-01 04:10:57.839591", "step": 75, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:10:57.869623", "step": 75, "epoch": 1 }, { "type": "loss", "content": 0.022298503667116165, "timestamp": "2025-10-01 04:10:57.893644", "step": 76, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:57.928200", "step": 76, "epoch": 1 }, { "type": "loss", "content": 0.02111753635108471, "timestamp": "2025-10-01 04:10:57.930478", "step": 77, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:57.961168", "step": 77, "epoch": 1 }, { "type": "loss", "content": 0.025581015273928642, "timestamp": "2025-10-01 04:10:57.963498", "step": 78, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:57.994194", "step": 78, "epoch": 1 }, { "type": "loss", "content": 0.024683058261871338, "timestamp": "2025-10-01 04:10:57.996316", "step": 79, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:10:58.030327", "step": 79, "epoch": 1 }, { "type": "loss", "content": 0.02450932003557682, "timestamp": "2025-10-01 04:10:58.053881", "step": 80, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:58.086434", "step": 80, "epoch": 1 }, { "type": "loss", "content": 0.02360459603369236, "timestamp": "2025-10-01 04:10:58.088397", "step": 81, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:58.119496", "step": 81, "epoch": 1 }, { "type": "loss", "content": 0.02305099368095398, "timestamp": "2025-10-01 04:10:58.122557", "step": 82, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:58.153531", "step": 82, "epoch": 1 }, { "type": "loss", "content": 0.033070627599954605, "timestamp": "2025-10-01 04:10:58.156013", "step": 83, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:58.190179", "step": 83, "epoch": 1 }, { "type": "loss", "content": 0.026605265215039253, "timestamp": "2025-10-01 04:10:58.214044", "step": 84, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:58.245854", "step": 84, "epoch": 1 }, { "type": "loss", "content": 0.023719897493720055, "timestamp": "2025-10-01 04:10:58.248348", "step": 85, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:58.279713", "step": 85, "epoch": 1 }, { "type": "loss", "content": 0.016296805813908577, "timestamp": "2025-10-01 04:10:58.282101", "step": 86, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:58.313234", "step": 86, "epoch": 1 }, { "type": "loss", "content": 0.019035162404179573, "timestamp": "2025-10-01 04:10:58.315368", "step": 87, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:58.346123", "step": 87, "epoch": 1 }, { "type": "loss", "content": 0.018415704369544983, "timestamp": "2025-10-01 04:10:58.369870", "step": 88, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:58.408527", "step": 88, "epoch": 1 }, { "type": "loss", "content": 0.01595030352473259, "timestamp": "2025-10-01 04:10:58.411022", "step": 89, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:10:58.442729", "step": 89, "epoch": 1 }, { "type": "loss", "content": 0.02289186790585518, "timestamp": "2025-10-01 04:10:58.444877", "step": 90, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:58.475873", "step": 90, "epoch": 1 }, { "type": "loss", "content": 0.02066519856452942, "timestamp": "2025-10-01 04:10:58.477833", "step": 91, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:10:58.508103", "step": 91, "epoch": 1 }, { "type": "loss", "content": 0.02692834846675396, "timestamp": "2025-10-01 04:10:58.531867", "step": 92, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:58.562945", "step": 92, "epoch": 1 }, { "type": "loss", "content": 0.016894636675715446, "timestamp": "2025-10-01 04:10:58.565180", "step": 93, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:58.596827", "step": 93, "epoch": 1 }, { "type": "loss", "content": 0.02401311881840229, "timestamp": "2025-10-01 04:10:58.598918", "step": 94, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:58.629500", "step": 94, "epoch": 1 }, { "type": "loss", "content": 0.020219998434185982, "timestamp": "2025-10-01 04:10:58.631735", "step": 95, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:58.661362", "step": 95, "epoch": 1 }, { "type": "loss", "content": 0.02421734668314457, "timestamp": "2025-10-01 04:10:58.685139", "step": 96, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:58.717069", "step": 96, "epoch": 1 }, { "type": "loss", "content": 0.019959433004260063, "timestamp": "2025-10-01 04:10:58.719484", "step": 97, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:58.750834", "step": 97, "epoch": 1 }, { "type": "loss", "content": 0.021867400035262108, "timestamp": "2025-10-01 04:10:58.753140", "step": 98, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:58.784119", "step": 98, "epoch": 1 }, { "type": "loss", "content": 0.017920518293976784, "timestamp": "2025-10-01 04:10:58.786535", "step": 99, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:58.817171", "step": 99, "epoch": 1 }, { "type": "loss", "content": 0.020041782408952713, "timestamp": "2025-10-01 04:10:58.840975", "step": 100, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:58.872564", "step": 100, "epoch": 1 }, { "type": "loss", "content": 0.018996138125658035, "timestamp": "2025-10-01 04:10:58.874761", "step": 101, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:58.905882", "step": 101, "epoch": 1 }, { "type": "loss", "content": 0.01724168471992016, "timestamp": "2025-10-01 04:10:58.907742", "step": 102, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:58.939080", "step": 102, "epoch": 1 }, { "type": "loss", "content": 0.020902881398797035, "timestamp": "2025-10-01 04:10:58.941101", "step": 103, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:58.971484", "step": 103, "epoch": 1 }, { "type": "loss", "content": 0.027707403525710106, "timestamp": "2025-10-01 04:10:58.995345", "step": 104, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:59.026821", "step": 104, "epoch": 1 }, { "type": "loss", "content": 0.03331645205616951, "timestamp": "2025-10-01 04:10:59.029874", "step": 105, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:59.065983", "step": 105, "epoch": 1 }, { "type": "loss", "content": 0.009206474758684635, "timestamp": "2025-10-01 04:10:59.068593", "step": 106, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:10:59.099413", "step": 106, "epoch": 1 }, { "type": "loss", "content": 0.01976872608065605, "timestamp": "2025-10-01 04:10:59.101505", "step": 107, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:10:59.132651", "step": 107, "epoch": 1 }, { "type": "loss", "content": 0.04233280569314957, "timestamp": "2025-10-01 04:10:59.156597", "step": 108, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:59.188553", "step": 108, "epoch": 1 }, { "type": "loss", "content": 0.04243811219930649, "timestamp": "2025-10-01 04:10:59.190853", "step": 109, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:10:59.231560", "step": 109, "epoch": 1 }, { "type": "loss", "content": 0.03881783038377762, "timestamp": "2025-10-01 04:10:59.234032", "step": 110, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:59.264775", "step": 110, "epoch": 1 }, { "type": "loss", "content": 0.008653001859784126, "timestamp": "2025-10-01 04:10:59.267086", "step": 111, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:59.297628", "step": 111, "epoch": 1 }, { "type": "loss", "content": 0.027102604508399963, "timestamp": "2025-10-01 04:10:59.321517", "step": 112, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:10:59.358722", "step": 112, "epoch": 1 }, { "type": "loss", "content": 0.029516849666833878, "timestamp": "2025-10-01 04:10:59.373841", "step": 113, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:10:59.438262", "step": 113, "epoch": 1 }, { "type": "loss", "content": 0.021250639110803604, "timestamp": "2025-10-01 04:10:59.443863", "step": 114, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:11:00.689761", "step": 114, "epoch": 1 }, { "type": "pplx", "content": 119027336.35998133, "timestamp": "2025-10-01 04:11:00.691802", "step": 114, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:00.721252", "step": 114, "epoch": 1 }, { "type": "loss", "content": 0.010005722753703594, "timestamp": "2025-10-01 04:11:00.723303", "step": 115, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:00.754543", "step": 115, "epoch": 1 }, { "type": "loss", "content": 0.0302440095692873, "timestamp": "2025-10-01 04:11:00.778465", "step": 116, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:00.811694", "step": 116, "epoch": 1 }, { "type": "loss", "content": 0.021053824573755264, "timestamp": "2025-10-01 04:11:00.813938", "step": 117, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:00.844816", "step": 117, "epoch": 1 }, { "type": "loss", "content": 0.019699398428201675, "timestamp": "2025-10-01 04:11:00.846787", "step": 118, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:00.876873", "step": 118, "epoch": 1 }, { "type": "loss", "content": 0.018002357333898544, "timestamp": "2025-10-01 04:11:00.879564", "step": 119, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:00.910805", "step": 119, "epoch": 1 }, { "type": "loss", "content": 0.02030670829117298, "timestamp": "2025-10-01 04:11:00.934825", "step": 120, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:00.967310", "step": 120, "epoch": 1 }, { "type": "loss", "content": 0.030375460162758827, "timestamp": "2025-10-01 04:11:00.969369", "step": 121, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:00.999999", "step": 121, "epoch": 1 }, { "type": "loss", "content": 0.021546974778175354, "timestamp": "2025-10-01 04:11:01.002224", "step": 122, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:01.033058", "step": 122, "epoch": 1 }, { "type": "loss", "content": 0.022841254249215126, "timestamp": "2025-10-01 04:11:01.035390", "step": 123, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:01.066973", "step": 123, "epoch": 1 }, { "type": "loss", "content": 0.017218273133039474, "timestamp": "2025-10-01 04:11:01.090983", "step": 124, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:01.123155", "step": 124, "epoch": 1 }, { "type": "loss", "content": 0.019611822441220284, "timestamp": "2025-10-01 04:11:01.126793", "step": 125, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:01.162107", "step": 125, "epoch": 1 }, { "type": "loss", "content": 0.030845703557133675, "timestamp": "2025-10-01 04:11:01.164762", "step": 126, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:01.200174", "step": 126, "epoch": 1 }, { "type": "loss", "content": 0.020298536866903305, "timestamp": "2025-10-01 04:11:01.203233", "step": 127, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:01.237161", "step": 127, "epoch": 1 }, { "type": "loss", "content": 0.020675739273428917, "timestamp": "2025-10-01 04:11:01.261686", "step": 128, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:01.293347", "step": 128, "epoch": 1 }, { "type": "loss", "content": 0.02181154489517212, "timestamp": "2025-10-01 04:11:01.295522", "step": 129, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:01.326822", "step": 129, "epoch": 1 }, { "type": "loss", "content": 0.025619324296712875, "timestamp": "2025-10-01 04:11:01.329057", "step": 130, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:01.360457", "step": 130, "epoch": 1 }, { "type": "loss", "content": 0.028702473267912865, "timestamp": "2025-10-01 04:11:01.362897", "step": 131, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:01.394496", "step": 131, "epoch": 1 }, { "type": "loss", "content": 0.020739736035466194, "timestamp": "2025-10-01 04:11:01.418233", "step": 132, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:01.450579", "step": 132, "epoch": 1 }, { "type": "loss", "content": 0.01721821539103985, "timestamp": "2025-10-01 04:11:01.452685", "step": 133, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:01.483621", "step": 133, "epoch": 1 }, { "type": "loss", "content": 0.021848777309060097, "timestamp": "2025-10-01 04:11:01.485842", "step": 134, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:01.517014", "step": 134, "epoch": 1 }, { "type": "loss", "content": 0.02322843298316002, "timestamp": "2025-10-01 04:11:01.519106", "step": 135, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:01.551092", "step": 135, "epoch": 1 }, { "type": "loss", "content": 0.020518863573670387, "timestamp": "2025-10-01 04:11:01.574846", "step": 136, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:01.606313", "step": 136, "epoch": 1 }, { "type": "loss", "content": 0.020729046314954758, "timestamp": "2025-10-01 04:11:01.608280", "step": 137, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:01.639677", "step": 137, "epoch": 1 }, { "type": "loss", "content": 0.02201726660132408, "timestamp": "2025-10-01 04:11:01.642066", "step": 138, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:01.672490", "step": 138, "epoch": 1 }, { "type": "loss", "content": 0.01811814494431019, "timestamp": "2025-10-01 04:11:01.674619", "step": 139, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:01.705919", "step": 139, "epoch": 1 }, { "type": "loss", "content": 0.0218853447586298, "timestamp": "2025-10-01 04:11:01.731068", "step": 140, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:01.765941", "step": 140, "epoch": 1 }, { "type": "loss", "content": 0.01973516307771206, "timestamp": "2025-10-01 04:11:01.768961", "step": 141, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:01.810879", "step": 141, "epoch": 1 }, { "type": "loss", "content": 0.02040678821504116, "timestamp": "2025-10-01 04:11:01.813675", "step": 142, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:01.846417", "step": 142, "epoch": 1 }, { "type": "loss", "content": 0.02637314982712269, "timestamp": "2025-10-01 04:11:01.849125", "step": 143, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:01.882598", "step": 143, "epoch": 1 }, { "type": "loss", "content": 0.015394523739814758, "timestamp": "2025-10-01 04:11:01.908575", "step": 144, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:01.941974", "step": 144, "epoch": 1 }, { "type": "loss", "content": 0.016578085720539093, "timestamp": "2025-10-01 04:11:01.944314", "step": 145, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:01.977437", "step": 145, "epoch": 1 }, { "type": "loss", "content": 0.013782672584056854, "timestamp": "2025-10-01 04:11:01.980941", "step": 146, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:02.015604", "step": 146, "epoch": 1 }, { "type": "loss", "content": 0.029022570699453354, "timestamp": "2025-10-01 04:11:02.018395", "step": 147, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:02.051534", "step": 147, "epoch": 1 }, { "type": "loss", "content": 0.030791154131293297, "timestamp": "2025-10-01 04:11:02.075655", "step": 148, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:02.109123", "step": 148, "epoch": 1 }, { "type": "loss", "content": 0.04107801243662834, "timestamp": "2025-10-01 04:11:02.111665", "step": 149, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:02.144488", "step": 149, "epoch": 1 }, { "type": "loss", "content": 0.041130490601062775, "timestamp": "2025-10-01 04:11:02.147665", "step": 150, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:02.183413", "step": 150, "epoch": 1 }, { "type": "loss", "content": 0.019808387383818626, "timestamp": "2025-10-01 04:11:02.187533", "step": 151, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:02.225379", "step": 151, "epoch": 1 }, { "type": "loss", "content": 0.007825485430657864, "timestamp": "2025-10-01 04:11:02.251029", "step": 152, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:02.286208", "step": 152, "epoch": 1 }, { "type": "loss", "content": 0.013624719344079494, "timestamp": "2025-10-01 04:11:02.289379", "step": 153, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:02.323189", "step": 153, "epoch": 1 }, { "type": "loss", "content": 0.027168406173586845, "timestamp": "2025-10-01 04:11:02.325457", "step": 154, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:02.356991", "step": 154, "epoch": 1 }, { "type": "loss", "content": 0.032501544803380966, "timestamp": "2025-10-01 04:11:02.359146", "step": 155, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:02.390044", "step": 155, "epoch": 1 }, { "type": "loss", "content": 0.021303342655301094, "timestamp": "2025-10-01 04:11:02.413928", "step": 156, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:02.445555", "step": 156, "epoch": 1 }, { "type": "loss", "content": 0.032604530453681946, "timestamp": "2025-10-01 04:11:02.448009", "step": 157, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:02.478434", "step": 157, "epoch": 1 }, { "type": "loss", "content": 0.05039301887154579, "timestamp": "2025-10-01 04:11:02.480665", "step": 158, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:02.511680", "step": 158, "epoch": 1 }, { "type": "loss", "content": 0.039981160312891006, "timestamp": "2025-10-01 04:11:02.513943", "step": 159, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:02.544777", "step": 159, "epoch": 1 }, { "type": "loss", "content": 0.028644157573580742, "timestamp": "2025-10-01 04:11:02.568678", "step": 160, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:02.599445", "step": 160, "epoch": 1 }, { "type": "loss", "content": 0.02378627099096775, "timestamp": "2025-10-01 04:11:02.601720", "step": 161, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:02.632948", "step": 161, "epoch": 1 }, { "type": "loss", "content": 0.0275709368288517, "timestamp": "2025-10-01 04:11:02.635080", "step": 162, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:02.666605", "step": 162, "epoch": 1 }, { "type": "loss", "content": 0.02510923705995083, "timestamp": "2025-10-01 04:11:02.668605", "step": 163, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:02.699142", "step": 163, "epoch": 1 }, { "type": "loss", "content": 0.022733593359589577, "timestamp": "2025-10-01 04:11:02.723615", "step": 164, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:02.755022", "step": 164, "epoch": 1 }, { "type": "loss", "content": 0.023119447752833366, "timestamp": "2025-10-01 04:11:02.757374", "step": 165, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:02.787948", "step": 165, "epoch": 1 }, { "type": "loss", "content": 0.03037671558558941, "timestamp": "2025-10-01 04:11:02.790144", "step": 166, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:02.822656", "step": 166, "epoch": 1 }, { "type": "loss", "content": 0.02896520495414734, "timestamp": "2025-10-01 04:11:02.824787", "step": 167, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:02.855535", "step": 167, "epoch": 1 }, { "type": "loss", "content": 0.025343691930174828, "timestamp": "2025-10-01 04:11:02.879525", "step": 168, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:02.910741", "step": 168, "epoch": 1 }, { "type": "loss", "content": 0.024414045736193657, "timestamp": "2025-10-01 04:11:02.913245", "step": 169, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:02.944286", "step": 169, "epoch": 1 }, { "type": "loss", "content": 0.029930496588349342, "timestamp": "2025-10-01 04:11:02.946470", "step": 170, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:02.976836", "step": 170, "epoch": 1 }, { "type": "loss", "content": 0.024483921006321907, "timestamp": "2025-10-01 04:11:02.979151", "step": 171, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:11:03.712425", "step": 171, "epoch": 1 }, { "type": "pplx", "content": 120769265.99599761, "timestamp": "2025-10-01 04:11:03.714527", "step": 171, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:03.748965", "step": 171, "epoch": 1 }, { "type": "loss", "content": 0.024266691878437996, "timestamp": "2025-10-01 04:11:03.772876", "step": 172, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:03.804998", "step": 172, "epoch": 1 }, { "type": "loss", "content": 0.02391829900443554, "timestamp": "2025-10-01 04:11:03.807806", "step": 173, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:03.843090", "step": 173, "epoch": 1 }, { "type": "loss", "content": 0.023773299530148506, "timestamp": "2025-10-01 04:11:03.845386", "step": 174, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:03.880001", "step": 174, "epoch": 1 }, { "type": "loss", "content": 0.02384672500193119, "timestamp": "2025-10-01 04:11:03.882347", "step": 175, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:03.915356", "step": 175, "epoch": 1 }, { "type": "loss", "content": 0.025520270690321922, "timestamp": "2025-10-01 04:11:03.939925", "step": 176, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:03.973267", "step": 176, "epoch": 1 }, { "type": "loss", "content": 0.018843820318579674, "timestamp": "2025-10-01 04:11:03.976044", "step": 177, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:04.008928", "step": 177, "epoch": 1 }, { "type": "loss", "content": 0.018038298934698105, "timestamp": "2025-10-01 04:11:04.011534", "step": 178, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:04.047044", "step": 178, "epoch": 1 }, { "type": "loss", "content": 0.01665923371911049, "timestamp": "2025-10-01 04:11:04.049664", "step": 179, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:04.081583", "step": 179, "epoch": 1 }, { "type": "loss", "content": 0.024666251614689827, "timestamp": "2025-10-01 04:11:04.105542", "step": 180, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:04.139707", "step": 180, "epoch": 1 }, { "type": "loss", "content": 0.020171239972114563, "timestamp": "2025-10-01 04:11:04.144480", "step": 181, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:04.180714", "step": 181, "epoch": 1 }, { "type": "loss", "content": 0.030821431428194046, "timestamp": "2025-10-01 04:11:04.184291", "step": 182, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:04.221497", "step": 182, "epoch": 1 }, { "type": "loss", "content": 0.05390426889061928, "timestamp": "2025-10-01 04:11:04.225907", "step": 183, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:04.263174", "step": 183, "epoch": 1 }, { "type": "loss", "content": 0.03358237445354462, "timestamp": "2025-10-01 04:11:04.289628", "step": 184, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:04.327093", "step": 184, "epoch": 1 }, { "type": "loss", "content": 0.04122252017259598, "timestamp": "2025-10-01 04:11:04.331695", "step": 185, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:04.369519", "step": 185, "epoch": 1 }, { "type": "loss", "content": 0.021640826016664505, "timestamp": "2025-10-01 04:11:04.372368", "step": 186, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:04.406874", "step": 186, "epoch": 1 }, { "type": "loss", "content": 0.03316246345639229, "timestamp": "2025-10-01 04:11:04.410071", "step": 187, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:04.444007", "step": 187, "epoch": 1 }, { "type": "loss", "content": 0.027355531230568886, "timestamp": "2025-10-01 04:11:04.469025", "step": 188, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:04.506061", "step": 188, "epoch": 1 }, { "type": "loss", "content": 0.017095940187573433, "timestamp": "2025-10-01 04:11:04.509501", "step": 189, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:04.545675", "step": 189, "epoch": 1 }, { "type": "loss", "content": 0.021681906655430794, "timestamp": "2025-10-01 04:11:04.549775", "step": 190, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:04.588467", "step": 190, "epoch": 1 }, { "type": "loss", "content": 0.019009903073310852, "timestamp": "2025-10-01 04:11:04.592739", "step": 191, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:04.628572", "step": 191, "epoch": 1 }, { "type": "loss", "content": 0.020892620086669922, "timestamp": "2025-10-01 04:11:04.653945", "step": 192, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:04.690114", "step": 192, "epoch": 1 }, { "type": "loss", "content": 0.00846900511533022, "timestamp": "2025-10-01 04:11:04.694251", "step": 193, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:04.730830", "step": 193, "epoch": 1 }, { "type": "loss", "content": 0.015313724987208843, "timestamp": "2025-10-01 04:11:04.734818", "step": 194, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:04.772292", "step": 194, "epoch": 1 }, { "type": "loss", "content": 0.017627518624067307, "timestamp": "2025-10-01 04:11:04.776483", "step": 195, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:04.814091", "step": 195, "epoch": 1 }, { "type": "loss", "content": 0.018919754773378372, "timestamp": "2025-10-01 04:11:04.839417", "step": 196, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:04.875244", "step": 196, "epoch": 1 }, { "type": "loss", "content": 0.00864213053137064, "timestamp": "2025-10-01 04:11:04.879592", "step": 197, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:04.916555", "step": 197, "epoch": 1 }, { "type": "loss", "content": 0.021550923585891724, "timestamp": "2025-10-01 04:11:04.920440", "step": 198, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:04.957114", "step": 198, "epoch": 1 }, { "type": "loss", "content": 0.04059647396206856, "timestamp": "2025-10-01 04:11:04.960365", "step": 199, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:04.995341", "step": 199, "epoch": 1 }, { "type": "loss", "content": 0.03375014290213585, "timestamp": "2025-10-01 04:11:05.020901", "step": 200, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:05.057601", "step": 200, "epoch": 1 }, { "type": "loss", "content": 0.015561387874186039, "timestamp": "2025-10-01 04:11:05.061688", "step": 201, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:05.100943", "step": 201, "epoch": 1 }, { "type": "loss", "content": 0.019198428839445114, "timestamp": "2025-10-01 04:11:05.104906", "step": 202, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:05.141443", "step": 202, "epoch": 1 }, { "type": "loss", "content": 0.020807581022381783, "timestamp": "2025-10-01 04:11:05.146759", "step": 203, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:05.187110", "step": 203, "epoch": 1 }, { "type": "loss", "content": 0.021419154480099678, "timestamp": "2025-10-01 04:11:05.214280", "step": 204, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:05.260220", "step": 204, "epoch": 1 }, { "type": "loss", "content": 0.025552844628691673, "timestamp": "2025-10-01 04:11:05.266775", "step": 205, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:05.310785", "step": 205, "epoch": 1 }, { "type": "loss", "content": 0.00911114364862442, "timestamp": "2025-10-01 04:11:05.318082", "step": 206, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:05.363217", "step": 206, "epoch": 1 }, { "type": "loss", "content": 0.03169438615441322, "timestamp": "2025-10-01 04:11:05.369286", "step": 207, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:05.415227", "step": 207, "epoch": 1 }, { "type": "loss", "content": 0.01915550045669079, "timestamp": "2025-10-01 04:11:05.443789", "step": 208, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:05.493723", "step": 208, "epoch": 1 }, { "type": "loss", "content": 0.029754187911748886, "timestamp": "2025-10-01 04:11:05.498853", "step": 209, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:05.536518", "step": 209, "epoch": 1 }, { "type": "loss", "content": 0.03260469064116478, "timestamp": "2025-10-01 04:11:05.541058", "step": 210, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:05.581991", "step": 210, "epoch": 1 }, { "type": "loss", "content": 0.017987538129091263, "timestamp": "2025-10-01 04:11:05.586146", "step": 211, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:05.626067", "step": 211, "epoch": 1 }, { "type": "loss", "content": 0.03934092074632645, "timestamp": "2025-10-01 04:11:05.652596", "step": 212, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:05.693603", "step": 212, "epoch": 1 }, { "type": "loss", "content": 0.028932850807905197, "timestamp": "2025-10-01 04:11:05.697708", "step": 213, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:05.735723", "step": 213, "epoch": 1 }, { "type": "loss", "content": 0.020509885624051094, "timestamp": "2025-10-01 04:11:05.739455", "step": 214, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:05.775868", "step": 214, "epoch": 1 }, { "type": "loss", "content": 0.023546868935227394, "timestamp": "2025-10-01 04:11:05.779786", "step": 215, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:05.817622", "step": 215, "epoch": 1 }, { "type": "loss", "content": 0.024636918678879738, "timestamp": "2025-10-01 04:11:05.843730", "step": 216, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:05.881921", "step": 216, "epoch": 1 }, { "type": "loss", "content": 0.025502773001790047, "timestamp": "2025-10-01 04:11:05.885295", "step": 217, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:05.921181", "step": 217, "epoch": 1 }, { "type": "loss", "content": 0.029600782319903374, "timestamp": "2025-10-01 04:11:05.926059", "step": 218, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:05.962824", "step": 218, "epoch": 1 }, { "type": "loss", "content": 0.028552714735269547, "timestamp": "2025-10-01 04:11:05.969938", "step": 219, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:06.007101", "step": 219, "epoch": 1 }, { "type": "loss", "content": 0.021603135392069817, "timestamp": "2025-10-01 04:11:06.033295", "step": 220, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:06.069385", "step": 220, "epoch": 1 }, { "type": "loss", "content": 0.021389275789260864, "timestamp": "2025-10-01 04:11:06.072922", "step": 221, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:06.109361", "step": 221, "epoch": 1 }, { "type": "loss", "content": 0.023018766194581985, "timestamp": "2025-10-01 04:11:06.113062", "step": 222, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:06.151821", "step": 222, "epoch": 1 }, { "type": "loss", "content": 0.023001650348305702, "timestamp": "2025-10-01 04:11:06.155483", "step": 223, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:06.195371", "step": 223, "epoch": 1 }, { "type": "loss", "content": 0.026992013677954674, "timestamp": "2025-10-01 04:11:06.221985", "step": 224, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:06.264315", "step": 224, "epoch": 1 }, { "type": "loss", "content": 0.020342260599136353, "timestamp": "2025-10-01 04:11:06.269341", "step": 225, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:06.309157", "step": 225, "epoch": 1 }, { "type": "loss", "content": 0.022230759263038635, "timestamp": "2025-10-01 04:11:06.314884", "step": 226, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:06.355713", "step": 226, "epoch": 1 }, { "type": "loss", "content": 0.02584499679505825, "timestamp": "2025-10-01 04:11:06.359995", "step": 227, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:06.398943", "step": 227, "epoch": 1 }, { "type": "loss", "content": 0.028999576345086098, "timestamp": "2025-10-01 04:11:06.426940", "step": 228, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:11:07.366202", "step": 228, "epoch": 1 }, { "type": "pplx", "content": 122809426.57628937, "timestamp": "2025-10-01 04:11:07.369426", "step": 228, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:07.399317", "step": 228, "epoch": 1 }, { "type": "loss", "content": 0.023775307461619377, "timestamp": "2025-10-01 04:11:07.404664", "step": 229, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:07.442257", "step": 229, "epoch": 1 }, { "type": "loss", "content": 0.026211675256490707, "timestamp": "2025-10-01 04:11:07.446943", "step": 230, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:07.485575", "step": 230, "epoch": 1 }, { "type": "loss", "content": 0.026004331186413765, "timestamp": "2025-10-01 04:11:07.489770", "step": 231, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:07.524818", "step": 231, "epoch": 1 }, { "type": "loss", "content": 0.027609150856733322, "timestamp": "2025-10-01 04:11:07.549932", "step": 232, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:07.587581", "step": 232, "epoch": 1 }, { "type": "loss", "content": 0.02476361393928528, "timestamp": "2025-10-01 04:11:07.592182", "step": 233, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:07.628455", "step": 233, "epoch": 1 }, { "type": "loss", "content": 0.023204132914543152, "timestamp": "2025-10-01 04:11:07.638754", "step": 234, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:07.688423", "step": 234, "epoch": 1 }, { "type": "loss", "content": 0.022222131490707397, "timestamp": "2025-10-01 04:11:07.692520", "step": 235, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:07.728492", "step": 235, "epoch": 1 }, { "type": "loss", "content": 0.02399165742099285, "timestamp": "2025-10-01 04:11:07.755397", "step": 236, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:07.792695", "step": 236, "epoch": 1 }, { "type": "loss", "content": 0.01936420612037182, "timestamp": "2025-10-01 04:11:07.796632", "step": 237, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:07.832715", "step": 237, "epoch": 1 }, { "type": "loss", "content": 0.027001459151506424, "timestamp": "2025-10-01 04:11:07.835200", "step": 238, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:07.870834", "step": 238, "epoch": 1 }, { "type": "loss", "content": 0.03520625829696655, "timestamp": "2025-10-01 04:11:07.874883", "step": 239, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:07.915586", "step": 239, "epoch": 1 }, { "type": "loss", "content": 0.02217874489724636, "timestamp": "2025-10-01 04:11:07.947022", "step": 240, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:07.984484", "step": 240, "epoch": 1 }, { "type": "loss", "content": 0.017390580847859383, "timestamp": "2025-10-01 04:11:07.988482", "step": 241, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:08.026755", "step": 241, "epoch": 1 }, { "type": "loss", "content": 0.011072313413023949, "timestamp": "2025-10-01 04:11:08.030594", "step": 242, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:08.065665", "step": 242, "epoch": 1 }, { "type": "loss", "content": 0.00991674792021513, "timestamp": "2025-10-01 04:11:08.069462", "step": 243, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:08.105643", "step": 243, "epoch": 1 }, { "type": "loss", "content": 0.015564543195068836, "timestamp": "2025-10-01 04:11:08.131957", "step": 244, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:08.169739", "step": 244, "epoch": 1 }, { "type": "loss", "content": 0.044475387781858444, "timestamp": "2025-10-01 04:11:08.180027", "step": 245, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:08.230583", "step": 245, "epoch": 1 }, { "type": "loss", "content": 0.042544569820165634, "timestamp": "2025-10-01 04:11:08.235633", "step": 246, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:08.275466", "step": 246, "epoch": 1 }, { "type": "loss", "content": 0.03518744558095932, "timestamp": "2025-10-01 04:11:08.281910", "step": 247, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:08.323556", "step": 247, "epoch": 1 }, { "type": "loss", "content": 0.03052806854248047, "timestamp": "2025-10-01 04:11:08.351201", "step": 248, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:08.391918", "step": 248, "epoch": 1 }, { "type": "loss", "content": 0.04761296510696411, "timestamp": "2025-10-01 04:11:08.400264", "step": 249, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:08.454453", "step": 249, "epoch": 1 }, { "type": "loss", "content": 0.015664685517549515, "timestamp": "2025-10-01 04:11:08.457280", "step": 250, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:08.496811", "step": 250, "epoch": 1 }, { "type": "loss", "content": 0.03577407822012901, "timestamp": "2025-10-01 04:11:08.502152", "step": 251, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:08.544633", "step": 251, "epoch": 1 }, { "type": "loss", "content": 0.007992579601705074, "timestamp": "2025-10-01 04:11:08.572372", "step": 252, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:08.612756", "step": 252, "epoch": 1 }, { "type": "loss", "content": 0.00690767215564847, "timestamp": "2025-10-01 04:11:08.617787", "step": 253, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:08.657444", "step": 253, "epoch": 1 }, { "type": "loss", "content": 0.031856562942266464, "timestamp": "2025-10-01 04:11:08.661796", "step": 254, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:08.702019", "step": 254, "epoch": 1 }, { "type": "loss", "content": 0.014500172808766365, "timestamp": "2025-10-01 04:11:08.706666", "step": 255, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:11:08.745934", "step": 255, "epoch": 1 }, { "type": "loss", "content": 0.023240460082888603, "timestamp": "2025-10-01 04:11:08.772639", "step": 256, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:08.815253", "step": 256, "epoch": 1 }, { "type": "loss", "content": 0.010203330777585506, "timestamp": "2025-10-01 04:11:08.819684", "step": 257, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:08.857505", "step": 257, "epoch": 1 }, { "type": "loss", "content": 0.04106441140174866, "timestamp": "2025-10-01 04:11:08.862585", "step": 258, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:08.902139", "step": 258, "epoch": 1 }, { "type": "loss", "content": 0.028442902490496635, "timestamp": "2025-10-01 04:11:08.907438", "step": 259, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:08.948737", "step": 259, "epoch": 1 }, { "type": "loss", "content": 0.01924443431198597, "timestamp": "2025-10-01 04:11:08.976364", "step": 260, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:09.014149", "step": 260, "epoch": 1 }, { "type": "loss", "content": 0.02226688340306282, "timestamp": "2025-10-01 04:11:09.019034", "step": 261, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:09.058548", "step": 261, "epoch": 1 }, { "type": "loss", "content": 0.02148137055337429, "timestamp": "2025-10-01 04:11:09.063563", "step": 262, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:09.103810", "step": 262, "epoch": 1 }, { "type": "loss", "content": 0.021183112636208534, "timestamp": "2025-10-01 04:11:09.109446", "step": 263, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:09.149647", "step": 263, "epoch": 1 }, { "type": "loss", "content": 0.012026933021843433, "timestamp": "2025-10-01 04:11:09.177157", "step": 264, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:09.222469", "step": 264, "epoch": 1 }, { "type": "loss", "content": 0.022156137973070145, "timestamp": "2025-10-01 04:11:09.229074", "step": 265, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:09.271871", "step": 265, "epoch": 1 }, { "type": "loss", "content": 0.022157371044158936, "timestamp": "2025-10-01 04:11:09.276591", "step": 266, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:09.313687", "step": 266, "epoch": 1 }, { "type": "loss", "content": 0.02283759042620659, "timestamp": "2025-10-01 04:11:09.326584", "step": 267, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:09.381184", "step": 267, "epoch": 1 }, { "type": "loss", "content": 0.0349494144320488, "timestamp": "2025-10-01 04:11:09.412786", "step": 268, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:09.468443", "step": 268, "epoch": 1 }, { "type": "loss", "content": 0.01891479082405567, "timestamp": "2025-10-01 04:11:09.479449", "step": 269, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:09.539317", "step": 269, "epoch": 1 }, { "type": "loss", "content": 0.022123845294117928, "timestamp": "2025-10-01 04:11:09.548528", "step": 270, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:09.606951", "step": 270, "epoch": 1 }, { "type": "loss", "content": 0.022634102031588554, "timestamp": "2025-10-01 04:11:09.615883", "step": 271, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:09.664869", "step": 271, "epoch": 1 }, { "type": "loss", "content": 0.021512528881430626, "timestamp": "2025-10-01 04:11:09.695452", "step": 272, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:09.744714", "step": 272, "epoch": 1 }, { "type": "loss", "content": 0.022766796872019768, "timestamp": "2025-10-01 04:11:09.754152", "step": 273, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:09.809403", "step": 273, "epoch": 1 }, { "type": "loss", "content": 0.02063952013850212, "timestamp": "2025-10-01 04:11:09.817055", "step": 274, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:09.865832", "step": 274, "epoch": 1 }, { "type": "loss", "content": 0.02158406563103199, "timestamp": "2025-10-01 04:11:09.873314", "step": 275, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:09.927481", "step": 275, "epoch": 1 }, { "type": "loss", "content": 0.02682666666805744, "timestamp": "2025-10-01 04:11:09.958874", "step": 276, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:10.004073", "step": 276, "epoch": 1 }, { "type": "loss", "content": 0.017530372366309166, "timestamp": "2025-10-01 04:11:10.013130", "step": 277, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:10.060282", "step": 277, "epoch": 1 }, { "type": "loss", "content": 0.026719707995653152, "timestamp": "2025-10-01 04:11:10.067797", "step": 278, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:10.117019", "step": 278, "epoch": 1 }, { "type": "loss", "content": 0.024190085008740425, "timestamp": "2025-10-01 04:11:10.125956", "step": 279, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:10.177319", "step": 279, "epoch": 1 }, { "type": "loss", "content": 0.03567364439368248, "timestamp": "2025-10-01 04:11:10.209699", "step": 280, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:10.256584", "step": 280, "epoch": 1 }, { "type": "loss", "content": 0.01976270042359829, "timestamp": "2025-10-01 04:11:10.289482", "step": 281, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:10.326949", "step": 281, "epoch": 1 }, { "type": "loss", "content": 0.018775565549731255, "timestamp": "2025-10-01 04:11:10.337053", "step": 282, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:10.385431", "step": 282, "epoch": 1 }, { "type": "loss", "content": 0.022568348795175552, "timestamp": "2025-10-01 04:11:10.394946", "step": 283, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:10.449806", "step": 283, "epoch": 1 }, { "type": "loss", "content": 0.029433075338602066, "timestamp": "2025-10-01 04:11:10.481502", "step": 284, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:10.541012", "step": 284, "epoch": 1 }, { "type": "loss", "content": 0.02517537772655487, "timestamp": "2025-10-01 04:11:10.551862", "step": 285, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:11:12.023011", "step": 285, "epoch": 1 }, { "type": "pplx", "content": 123458419.66125065, "timestamp": "2025-10-01 04:11:12.033811", "step": 285, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:12.075459", "step": 285, "epoch": 1 }, { "type": "loss", "content": 0.01756308041512966, "timestamp": "2025-10-01 04:11:12.086183", "step": 286, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:11:12.139662", "step": 286, "epoch": 1 }, { "type": "loss", "content": 0.021046658977866173, "timestamp": "2025-10-01 04:11:12.150758", "step": 287, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:12.199184", "step": 287, "epoch": 1 }, { "type": "loss", "content": 0.02669466845691204, "timestamp": "2025-10-01 04:11:12.231814", "step": 288, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:12.266605", "step": 288, "epoch": 1 }, { "type": "loss", "content": 0.025276560336351395, "timestamp": "2025-10-01 04:11:12.275277", "step": 289, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:12.325581", "step": 289, "epoch": 1 }, { "type": "loss", "content": 0.017593776807188988, "timestamp": "2025-10-01 04:11:12.336113", "step": 290, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:12.385996", "step": 290, "epoch": 1 }, { "type": "loss", "content": 0.02213350310921669, "timestamp": "2025-10-01 04:11:12.389863", "step": 291, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:12.441883", "step": 291, "epoch": 1 }, { "type": "loss", "content": 0.024864699691534042, "timestamp": "2025-10-01 04:11:12.469387", "step": 292, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:12.527746", "step": 292, "epoch": 1 }, { "type": "loss", "content": 0.018032578751444817, "timestamp": "2025-10-01 04:11:12.531862", "step": 293, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:12.592939", "step": 293, "epoch": 1 }, { "type": "loss", "content": 0.025712715461850166, "timestamp": "2025-10-01 04:11:12.598654", "step": 294, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:12.644955", "step": 294, "epoch": 1 }, { "type": "loss", "content": 0.021214189007878304, "timestamp": "2025-10-01 04:11:12.649453", "step": 295, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:12.713329", "step": 295, "epoch": 1 }, { "type": "loss", "content": 0.017967000603675842, "timestamp": "2025-10-01 04:11:12.739521", "step": 296, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:12.801217", "step": 296, "epoch": 1 }, { "type": "loss", "content": 0.020910972729325294, "timestamp": "2025-10-01 04:11:12.808802", "step": 297, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:12.866167", "step": 297, "epoch": 1 }, { "type": "loss", "content": 0.025297032669186592, "timestamp": "2025-10-01 04:11:12.869763", "step": 298, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:12.925348", "step": 298, "epoch": 1 }, { "type": "loss", "content": 0.01858631707727909, "timestamp": "2025-10-01 04:11:12.936219", "step": 299, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:12.992095", "step": 299, "epoch": 1 }, { "type": "loss", "content": 0.018205394968390465, "timestamp": "2025-10-01 04:11:13.024923", "step": 300, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:13.068494", "step": 300, "epoch": 1 }, { "type": "loss", "content": 0.02749958448112011, "timestamp": "2025-10-01 04:11:13.071592", "step": 301, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:13.122732", "step": 301, "epoch": 1 }, { "type": "loss", "content": 0.012391953729093075, "timestamp": "2025-10-01 04:11:13.132353", "step": 302, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:13.185938", "step": 302, "epoch": 1 }, { "type": "loss", "content": 0.01051428634673357, "timestamp": "2025-10-01 04:11:13.195347", "step": 303, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:13.242587", "step": 303, "epoch": 1 }, { "type": "loss", "content": 0.017451424151659012, "timestamp": "2025-10-01 04:11:13.267884", "step": 304, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:13.314376", "step": 304, "epoch": 1 }, { "type": "loss", "content": 0.019919848069548607, "timestamp": "2025-10-01 04:11:13.317479", "step": 305, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:13.366986", "step": 305, "epoch": 1 }, { "type": "loss", "content": 0.03193918988108635, "timestamp": "2025-10-01 04:11:13.371398", "step": 306, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:13.424406", "step": 306, "epoch": 1 }, { "type": "loss", "content": 0.030981769785284996, "timestamp": "2025-10-01 04:11:13.434069", "step": 307, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:13.477614", "step": 307, "epoch": 1 }, { "type": "loss", "content": 0.020484397187829018, "timestamp": "2025-10-01 04:11:13.508874", "step": 308, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:13.563311", "step": 308, "epoch": 1 }, { "type": "loss", "content": 0.006139458157122135, "timestamp": "2025-10-01 04:11:13.567099", "step": 309, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:13.605957", "step": 309, "epoch": 1 }, { "type": "loss", "content": 0.04061734676361084, "timestamp": "2025-10-01 04:11:13.615570", "step": 310, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:13.661155", "step": 310, "epoch": 1 }, { "type": "loss", "content": 0.03407382592558861, "timestamp": "2025-10-01 04:11:13.665953", "step": 311, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:13.713417", "step": 311, "epoch": 1 }, { "type": "loss", "content": 0.018781576305627823, "timestamp": "2025-10-01 04:11:13.743162", "step": 312, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:13.789236", "step": 312, "epoch": 1 }, { "type": "loss", "content": 0.02269078977406025, "timestamp": "2025-10-01 04:11:13.797855", "step": 313, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:13.832261", "step": 313, "epoch": 1 }, { "type": "loss", "content": 0.017462121322751045, "timestamp": "2025-10-01 04:11:13.836397", "step": 314, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:13.879321", "step": 314, "epoch": 1 }, { "type": "loss", "content": 0.0337679348886013, "timestamp": "2025-10-01 04:11:13.882439", "step": 315, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:13.928027", "step": 315, "epoch": 1 }, { "type": "loss", "content": 0.005852936767041683, "timestamp": "2025-10-01 04:11:13.953685", "step": 316, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:14.012234", "step": 316, "epoch": 1 }, { "type": "loss", "content": 0.015508134849369526, "timestamp": "2025-10-01 04:11:14.016350", "step": 317, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:14.063073", "step": 317, "epoch": 1 }, { "type": "loss", "content": 0.030224744230508804, "timestamp": "2025-10-01 04:11:14.073973", "step": 318, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:14.128368", "step": 318, "epoch": 1 }, { "type": "loss", "content": 0.01716790907084942, "timestamp": "2025-10-01 04:11:14.137149", "step": 319, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:14.181796", "step": 319, "epoch": 1 }, { "type": "loss", "content": 0.020314916968345642, "timestamp": "2025-10-01 04:11:14.207770", "step": 320, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:14.243735", "step": 320, "epoch": 1 }, { "type": "loss", "content": 0.02022932842373848, "timestamp": "2025-10-01 04:11:14.252250", "step": 321, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:14.287561", "step": 321, "epoch": 1 }, { "type": "loss", "content": 0.03518899530172348, "timestamp": "2025-10-01 04:11:14.292085", "step": 322, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:14.329769", "step": 322, "epoch": 1 }, { "type": "loss", "content": 0.03895551338791847, "timestamp": "2025-10-01 04:11:14.333485", "step": 323, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:14.381017", "step": 323, "epoch": 1 }, { "type": "loss", "content": 0.030202005058526993, "timestamp": "2025-10-01 04:11:14.413907", "step": 324, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:14.451810", "step": 324, "epoch": 1 }, { "type": "loss", "content": 0.02211807295680046, "timestamp": "2025-10-01 04:11:14.459989", "step": 325, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:14.508668", "step": 325, "epoch": 1 }, { "type": "loss", "content": 0.025438370183110237, "timestamp": "2025-10-01 04:11:14.518422", "step": 326, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:14.568591", "step": 326, "epoch": 1 }, { "type": "loss", "content": 0.028444336727261543, "timestamp": "2025-10-01 04:11:14.571132", "step": 327, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:11:14.621409", "step": 327, "epoch": 1 }, { "type": "loss", "content": 0.026081737130880356, "timestamp": "2025-10-01 04:11:14.651185", "step": 328, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:14.689696", "step": 328, "epoch": 1 }, { "type": "loss", "content": 0.026819607242941856, "timestamp": "2025-10-01 04:11:14.701449", "step": 329, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:14.758718", "step": 329, "epoch": 1 }, { "type": "loss", "content": 0.024707946926355362, "timestamp": "2025-10-01 04:11:14.773714", "step": 330, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:14.837086", "step": 330, "epoch": 1 }, { "type": "loss", "content": 0.021115347743034363, "timestamp": "2025-10-01 04:11:14.850224", "step": 331, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:14.913208", "step": 331, "epoch": 1 }, { "type": "loss", "content": 0.022157754749059677, "timestamp": "2025-10-01 04:11:14.947292", "step": 332, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:15.016196", "step": 332, "epoch": 1 }, { "type": "loss", "content": 0.02936716005206108, "timestamp": "2025-10-01 04:11:15.028300", "step": 333, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:15.082357", "step": 333, "epoch": 1 }, { "type": "loss", "content": 0.0242962297052145, "timestamp": "2025-10-01 04:11:15.093062", "step": 334, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:15.152137", "step": 334, "epoch": 1 }, { "type": "loss", "content": 0.024363428354263306, "timestamp": "2025-10-01 04:11:15.161507", "step": 335, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:11:15.215923", "step": 335, "epoch": 1 }, { "type": "loss", "content": 0.030734961852431297, "timestamp": "2025-10-01 04:11:15.247500", "step": 336, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:15.290273", "step": 336, "epoch": 1 }, { "type": "loss", "content": 0.03119891881942749, "timestamp": "2025-10-01 04:11:15.298587", "step": 337, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:15.352209", "step": 337, "epoch": 1 }, { "type": "loss", "content": 0.027752574533224106, "timestamp": "2025-10-01 04:11:15.361906", "step": 338, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:15.419423", "step": 338, "epoch": 1 }, { "type": "loss", "content": 0.023446708917617798, "timestamp": "2025-10-01 04:11:15.423173", "step": 339, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:15.478963", "step": 339, "epoch": 1 }, { "type": "loss", "content": 0.02432301454246044, "timestamp": "2025-10-01 04:11:15.510136", "step": 340, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:15.563526", "step": 340, "epoch": 1 }, { "type": "loss", "content": 0.0277759600430727, "timestamp": "2025-10-01 04:11:15.571468", "step": 341, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:15.626770", "step": 341, "epoch": 1 }, { "type": "loss", "content": 0.026494231075048447, "timestamp": "2025-10-01 04:11:15.636723", "step": 342, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:11:16.887801", "step": 342, "epoch": 1 }, { "type": "pplx", "content": 119641692.44329666, "timestamp": "2025-10-01 04:11:16.892772", "step": 342, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:16.929417", "step": 342, "epoch": 1 }, { "type": "loss", "content": 0.025429880246520042, "timestamp": "2025-10-01 04:11:16.934622", "step": 343, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:16.976165", "step": 343, "epoch": 1 }, { "type": "loss", "content": 0.02810918726027012, "timestamp": "2025-10-01 04:11:17.000203", "step": 344, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:17.036728", "step": 344, "epoch": 1 }, { "type": "loss", "content": 0.02276870794594288, "timestamp": "2025-10-01 04:11:17.042121", "step": 345, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:17.088648", "step": 345, "epoch": 1 }, { "type": "loss", "content": 0.02521303854882717, "timestamp": "2025-10-01 04:11:17.095138", "step": 346, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:17.150265", "step": 346, "epoch": 1 }, { "type": "loss", "content": 0.0223286934196949, "timestamp": "2025-10-01 04:11:17.161778", "step": 347, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:17.224552", "step": 347, "epoch": 1 }, { "type": "loss", "content": 0.02348393388092518, "timestamp": "2025-10-01 04:11:17.259066", "step": 348, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:17.319123", "step": 348, "epoch": 1 }, { "type": "loss", "content": 0.019929800182580948, "timestamp": "2025-10-01 04:11:17.328584", "step": 349, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:17.376797", "step": 349, "epoch": 1 }, { "type": "loss", "content": 0.024558162316679955, "timestamp": "2025-10-01 04:11:17.379457", "step": 350, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:17.415822", "step": 350, "epoch": 1 }, { "type": "loss", "content": 0.027435820549726486, "timestamp": "2025-10-01 04:11:17.419765", "step": 351, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:17.458325", "step": 351, "epoch": 1 }, { "type": "loss", "content": 0.025386372581124306, "timestamp": "2025-10-01 04:11:17.486734", "step": 352, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:17.524048", "step": 352, "epoch": 1 }, { "type": "loss", "content": 0.028569668531417847, "timestamp": "2025-10-01 04:11:17.533496", "step": 353, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:17.589555", "step": 353, "epoch": 1 }, { "type": "loss", "content": 0.023457257077097893, "timestamp": "2025-10-01 04:11:17.598781", "step": 354, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:17.645443", "step": 354, "epoch": 1 }, { "type": "loss", "content": 0.025979798287153244, "timestamp": "2025-10-01 04:11:17.648451", "step": 355, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:17.688180", "step": 355, "epoch": 1 }, { "type": "loss", "content": 0.01912418194115162, "timestamp": "2025-10-01 04:11:17.718475", "step": 356, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:17.764883", "step": 356, "epoch": 1 }, { "type": "loss", "content": 0.020123664289712906, "timestamp": "2025-10-01 04:11:17.768559", "step": 357, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:17.821188", "step": 357, "epoch": 1 }, { "type": "loss", "content": 0.017067687585949898, "timestamp": "2025-10-01 04:11:17.824440", "step": 358, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:17.865217", "step": 358, "epoch": 1 }, { "type": "loss", "content": 0.015169544145464897, "timestamp": "2025-10-01 04:11:17.876455", "step": 359, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:17.922614", "step": 359, "epoch": 1 }, { "type": "loss", "content": 0.021369751542806625, "timestamp": "2025-10-01 04:11:17.951034", "step": 360, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:17.990032", "step": 360, "epoch": 1 }, { "type": "loss", "content": 0.025677619501948357, "timestamp": "2025-10-01 04:11:17.992682", "step": 361, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:18.029442", "step": 361, "epoch": 1 }, { "type": "loss", "content": 0.03131221607327461, "timestamp": "2025-10-01 04:11:18.036011", "step": 362, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:18.070878", "step": 362, "epoch": 1 }, { "type": "loss", "content": 0.02442021481692791, "timestamp": "2025-10-01 04:11:18.077546", "step": 363, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:18.115727", "step": 363, "epoch": 1 }, { "type": "loss", "content": 0.013035761192440987, "timestamp": "2025-10-01 04:11:18.140492", "step": 364, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:18.184440", "step": 364, "epoch": 1 }, { "type": "loss", "content": 0.02659951150417328, "timestamp": "2025-10-01 04:11:18.187011", "step": 365, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:18.223164", "step": 365, "epoch": 1 }, { "type": "loss", "content": 0.016170557588338852, "timestamp": "2025-10-01 04:11:18.229705", "step": 366, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:11:18.274709", "step": 366, "epoch": 1 }, { "type": "loss", "content": 0.018221816048026085, "timestamp": "2025-10-01 04:11:18.282506", "step": 367, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:18.327114", "step": 367, "epoch": 1 }, { "type": "loss", "content": 0.02146676555275917, "timestamp": "2025-10-01 04:11:18.354184", "step": 368, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:18.389949", "step": 368, "epoch": 1 }, { "type": "loss", "content": 0.025201361626386642, "timestamp": "2025-10-01 04:11:18.395673", "step": 369, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:18.437111", "step": 369, "epoch": 1 }, { "type": "loss", "content": 0.026593759655952454, "timestamp": "2025-10-01 04:11:18.443651", "step": 370, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:18.486574", "step": 370, "epoch": 1 }, { "type": "loss", "content": 0.01598348841071129, "timestamp": "2025-10-01 04:11:18.493550", "step": 371, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:18.535039", "step": 371, "epoch": 1 }, { "type": "loss", "content": 0.030138777568936348, "timestamp": "2025-10-01 04:11:18.562546", "step": 372, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:18.603815", "step": 372, "epoch": 1 }, { "type": "loss", "content": 0.028218790888786316, "timestamp": "2025-10-01 04:11:18.612077", "step": 373, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:18.652703", "step": 373, "epoch": 1 }, { "type": "loss", "content": 0.03160586208105087, "timestamp": "2025-10-01 04:11:18.658428", "step": 374, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:18.697420", "step": 374, "epoch": 1 }, { "type": "loss", "content": 0.018323153257369995, "timestamp": "2025-10-01 04:11:18.703450", "step": 375, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:18.742862", "step": 375, "epoch": 1 }, { "type": "loss", "content": 0.020284952595829964, "timestamp": "2025-10-01 04:11:18.768866", "step": 376, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:18.804798", "step": 376, "epoch": 1 }, { "type": "loss", "content": 0.012562897987663746, "timestamp": "2025-10-01 04:11:18.809945", "step": 377, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:18.855185", "step": 377, "epoch": 1 }, { "type": "loss", "content": 0.019718678668141365, "timestamp": "2025-10-01 04:11:18.861059", "step": 378, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:18.900339", "step": 378, "epoch": 1 }, { "type": "loss", "content": 0.01936933770775795, "timestamp": "2025-10-01 04:11:18.905467", "step": 379, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:18.951842", "step": 379, "epoch": 1 }, { "type": "loss", "content": 0.022852379828691483, "timestamp": "2025-10-01 04:11:18.978112", "step": 380, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:19.023384", "step": 380, "epoch": 1 }, { "type": "loss", "content": 0.026843106374144554, "timestamp": "2025-10-01 04:11:19.031828", "step": 381, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:19.076923", "step": 381, "epoch": 1 }, { "type": "loss", "content": 0.019951580092310905, "timestamp": "2025-10-01 04:11:19.086965", "step": 382, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:19.132695", "step": 382, "epoch": 1 }, { "type": "loss", "content": 0.012562128715217113, "timestamp": "2025-10-01 04:11:19.140474", "step": 383, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:19.181383", "step": 383, "epoch": 1 }, { "type": "loss", "content": 0.020456364378333092, "timestamp": "2025-10-01 04:11:19.210285", "step": 384, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:19.248051", "step": 384, "epoch": 1 }, { "type": "loss", "content": 0.015333155170083046, "timestamp": "2025-10-01 04:11:19.253166", "step": 385, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:19.295231", "step": 385, "epoch": 1 }, { "type": "loss", "content": 0.018947962671518326, "timestamp": "2025-10-01 04:11:19.301013", "step": 386, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:19.337554", "step": 386, "epoch": 1 }, { "type": "loss", "content": 0.018343036994338036, "timestamp": "2025-10-01 04:11:19.342893", "step": 387, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:19.383521", "step": 387, "epoch": 1 }, { "type": "loss", "content": 0.01990184746682644, "timestamp": "2025-10-01 04:11:19.411725", "step": 388, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:19.457528", "step": 388, "epoch": 1 }, { "type": "loss", "content": 0.01581694185733795, "timestamp": "2025-10-01 04:11:19.464236", "step": 389, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:19.508554", "step": 389, "epoch": 1 }, { "type": "loss", "content": 0.02895066700875759, "timestamp": "2025-10-01 04:11:19.520622", "step": 390, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:19.563260", "step": 390, "epoch": 1 }, { "type": "loss", "content": 0.034577902406454086, "timestamp": "2025-10-01 04:11:19.570058", "step": 391, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:19.620479", "step": 391, "epoch": 1 }, { "type": "loss", "content": 0.01622415892779827, "timestamp": "2025-10-01 04:11:19.648093", "step": 392, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:19.688188", "step": 392, "epoch": 1 }, { "type": "loss", "content": 0.017028475180268288, "timestamp": "2025-10-01 04:11:19.692937", "step": 393, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:11:19.732699", "step": 393, "epoch": 1 }, { "type": "loss", "content": 0.021526996046304703, "timestamp": "2025-10-01 04:11:19.738216", "step": 394, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:19.781318", "step": 394, "epoch": 1 }, { "type": "loss", "content": 0.011884956620633602, "timestamp": "2025-10-01 04:11:19.785612", "step": 395, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:19.825853", "step": 395, "epoch": 1 }, { "type": "loss", "content": 0.023782964795827866, "timestamp": "2025-10-01 04:11:19.855267", "step": 396, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:19.904400", "step": 396, "epoch": 1 }, { "type": "loss", "content": 0.011757438071072102, "timestamp": "2025-10-01 04:11:19.915780", "step": 397, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:19.963058", "step": 397, "epoch": 1 }, { "type": "loss", "content": 0.01555390004068613, "timestamp": "2025-10-01 04:11:19.969003", "step": 398, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:20.017948", "step": 398, "epoch": 1 }, { "type": "loss", "content": 0.023047687485814095, "timestamp": "2025-10-01 04:11:20.020359", "step": 399, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:11:21.209850", "step": 399, "epoch": 1 }, { "type": "pplx", "content": 122024603.38308345, "timestamp": "2025-10-01 04:11:21.217118", "step": 399, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:21.254869", "step": 399, "epoch": 1 }, { "type": "loss", "content": 0.01790771260857582, "timestamp": "2025-10-01 04:11:21.284616", "step": 400, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:21.336133", "step": 400, "epoch": 1 }, { "type": "loss", "content": 0.011960442177951336, "timestamp": "2025-10-01 04:11:21.344495", "step": 401, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:21.395634", "step": 401, "epoch": 1 }, { "type": "loss", "content": 0.01990598626434803, "timestamp": "2025-10-01 04:11:21.406115", "step": 402, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:21.452092", "step": 402, "epoch": 1 }, { "type": "loss", "content": 0.020694376900792122, "timestamp": "2025-10-01 04:11:21.458386", "step": 403, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:21.512984", "step": 403, "epoch": 1 }, { "type": "loss", "content": 0.029738137498497963, "timestamp": "2025-10-01 04:11:21.544812", "step": 404, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:21.594232", "step": 404, "epoch": 1 }, { "type": "loss", "content": 0.027073880657553673, "timestamp": "2025-10-01 04:11:21.602336", "step": 405, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:21.653692", "step": 405, "epoch": 1 }, { "type": "loss", "content": 0.03174943849444389, "timestamp": "2025-10-01 04:11:21.663507", "step": 406, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:21.714786", "step": 406, "epoch": 1 }, { "type": "loss", "content": 0.014953884296119213, "timestamp": "2025-10-01 04:11:21.722188", "step": 407, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:21.771943", "step": 407, "epoch": 1 }, { "type": "loss", "content": 0.012122013606131077, "timestamp": "2025-10-01 04:11:21.805997", "step": 408, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:21.844584", "step": 408, "epoch": 1 }, { "type": "loss", "content": 0.020316915586590767, "timestamp": "2025-10-01 04:11:21.854618", "step": 409, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:21.907373", "step": 409, "epoch": 1 }, { "type": "loss", "content": 0.021981576457619667, "timestamp": "2025-10-01 04:11:21.918327", "step": 410, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:21.966125", "step": 410, "epoch": 1 }, { "type": "loss", "content": 0.018987832590937614, "timestamp": "2025-10-01 04:11:21.973583", "step": 411, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:22.021265", "step": 411, "epoch": 1 }, { "type": "loss", "content": 0.031161749735474586, "timestamp": "2025-10-01 04:11:22.046053", "step": 412, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:22.096071", "step": 412, "epoch": 1 }, { "type": "loss", "content": 0.015418009832501411, "timestamp": "2025-10-01 04:11:22.103458", "step": 413, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:22.155537", "step": 413, "epoch": 1 }, { "type": "loss", "content": 0.030573803931474686, "timestamp": "2025-10-01 04:11:22.163064", "step": 414, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:22.207582", "step": 414, "epoch": 1 }, { "type": "loss", "content": 0.03835904970765114, "timestamp": "2025-10-01 04:11:22.214062", "step": 415, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:22.261659", "step": 415, "epoch": 1 }, { "type": "loss", "content": 0.01001194678246975, "timestamp": "2025-10-01 04:11:22.292483", "step": 416, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:22.342012", "step": 416, "epoch": 1 }, { "type": "loss", "content": 0.025580940768122673, "timestamp": "2025-10-01 04:11:22.351194", "step": 417, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:22.397964", "step": 417, "epoch": 1 }, { "type": "loss", "content": 0.02997286431491375, "timestamp": "2025-10-01 04:11:22.408030", "step": 418, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:22.463339", "step": 418, "epoch": 1 }, { "type": "loss", "content": 0.021901780739426613, "timestamp": "2025-10-01 04:11:22.469058", "step": 419, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:22.513040", "step": 419, "epoch": 1 }, { "type": "loss", "content": 0.016856132075190544, "timestamp": "2025-10-01 04:11:22.541490", "step": 420, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:22.588021", "step": 420, "epoch": 1 }, { "type": "loss", "content": 0.011998930014669895, "timestamp": "2025-10-01 04:11:22.599767", "step": 421, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:22.664805", "step": 421, "epoch": 1 }, { "type": "loss", "content": 0.011346760205924511, "timestamp": "2025-10-01 04:11:22.673954", "step": 422, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:22.723750", "step": 422, "epoch": 1 }, { "type": "loss", "content": 0.01255186740309, "timestamp": "2025-10-01 04:11:22.726544", "step": 423, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:22.773621", "step": 423, "epoch": 1 }, { "type": "loss", "content": 0.021336868405342102, "timestamp": "2025-10-01 04:11:22.804610", "step": 424, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:22.859929", "step": 424, "epoch": 1 }, { "type": "loss", "content": 0.0202977005392313, "timestamp": "2025-10-01 04:11:22.871066", "step": 425, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:22.924978", "step": 425, "epoch": 1 }, { "type": "loss", "content": 0.030498603358864784, "timestamp": "2025-10-01 04:11:22.928692", "step": 426, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:22.976022", "step": 426, "epoch": 1 }, { "type": "loss", "content": 0.02014249376952648, "timestamp": "2025-10-01 04:11:22.987075", "step": 427, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:23.040232", "step": 427, "epoch": 1 }, { "type": "loss", "content": 0.02097696252167225, "timestamp": "2025-10-01 04:11:23.072232", "step": 428, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:23.123792", "step": 428, "epoch": 1 }, { "type": "loss", "content": 0.038673240691423416, "timestamp": "2025-10-01 04:11:23.127105", "step": 429, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:23.174343", "step": 429, "epoch": 1 }, { "type": "loss", "content": 0.01677296869456768, "timestamp": "2025-10-01 04:11:23.182653", "step": 430, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:23.231806", "step": 430, "epoch": 1 }, { "type": "loss", "content": 0.012115752324461937, "timestamp": "2025-10-01 04:11:23.241274", "step": 431, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:23.289502", "step": 431, "epoch": 1 }, { "type": "loss", "content": 0.019276294857263565, "timestamp": "2025-10-01 04:11:23.318581", "step": 432, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:23.358622", "step": 432, "epoch": 1 }, { "type": "loss", "content": 0.024136707186698914, "timestamp": "2025-10-01 04:11:23.368852", "step": 433, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:11:23.421813", "step": 433, "epoch": 1 }, { "type": "loss", "content": 0.02334047667682171, "timestamp": "2025-10-01 04:11:23.427072", "step": 434, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:23.472575", "step": 434, "epoch": 1 }, { "type": "loss", "content": 0.020124908536672592, "timestamp": "2025-10-01 04:11:23.482096", "step": 435, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:23.535856", "step": 435, "epoch": 1 }, { "type": "loss", "content": 0.018802836537361145, "timestamp": "2025-10-01 04:11:23.568582", "step": 436, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:23.618627", "step": 436, "epoch": 1 }, { "type": "loss", "content": 0.023524196818470955, "timestamp": "2025-10-01 04:11:23.622349", "step": 437, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:23.657160", "step": 437, "epoch": 1 }, { "type": "loss", "content": 0.023192165419459343, "timestamp": "2025-10-01 04:11:23.665523", "step": 438, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:23.707024", "step": 438, "epoch": 1 }, { "type": "loss", "content": 0.03054480254650116, "timestamp": "2025-10-01 04:11:23.713306", "step": 439, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:23.762565", "step": 439, "epoch": 1 }, { "type": "loss", "content": 0.021388614550232887, "timestamp": "2025-10-01 04:11:23.790395", "step": 440, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:23.835924", "step": 440, "epoch": 1 }, { "type": "loss", "content": 0.029041411355137825, "timestamp": "2025-10-01 04:11:23.843332", "step": 441, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:23.889330", "step": 441, "epoch": 1 }, { "type": "loss", "content": 0.027110761031508446, "timestamp": "2025-10-01 04:11:23.896453", "step": 442, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:23.940065", "step": 442, "epoch": 1 }, { "type": "loss", "content": 0.0177504513412714, "timestamp": "2025-10-01 04:11:23.947262", "step": 443, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:23.996658", "step": 443, "epoch": 1 }, { "type": "loss", "content": 0.02896556630730629, "timestamp": "2025-10-01 04:11:24.025450", "step": 444, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:24.069273", "step": 444, "epoch": 1 }, { "type": "loss", "content": 0.02039783075451851, "timestamp": "2025-10-01 04:11:24.075484", "step": 445, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:24.120500", "step": 445, "epoch": 1 }, { "type": "loss", "content": 0.021455178037285805, "timestamp": "2025-10-01 04:11:24.123772", "step": 446, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:24.160550", "step": 446, "epoch": 1 }, { "type": "loss", "content": 0.01578565128147602, "timestamp": "2025-10-01 04:11:24.167380", "step": 447, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:24.213798", "step": 447, "epoch": 1 }, { "type": "loss", "content": 0.019582688808441162, "timestamp": "2025-10-01 04:11:24.246017", "step": 448, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:24.290111", "step": 448, "epoch": 1 }, { "type": "loss", "content": 0.017545174807310104, "timestamp": "2025-10-01 04:11:24.297889", "step": 449, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:24.338911", "step": 449, "epoch": 1 }, { "type": "loss", "content": 0.02778690680861473, "timestamp": "2025-10-01 04:11:24.345652", "step": 450, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:24.392748", "step": 450, "epoch": 1 }, { "type": "loss", "content": 0.020310161635279655, "timestamp": "2025-10-01 04:11:24.399949", "step": 451, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:24.443342", "step": 451, "epoch": 1 }, { "type": "loss", "content": 0.01537756435573101, "timestamp": "2025-10-01 04:11:24.474569", "step": 452, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:24.523173", "step": 452, "epoch": 1 }, { "type": "loss", "content": 0.02046174742281437, "timestamp": "2025-10-01 04:11:24.528769", "step": 453, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:11:24.576663", "step": 453, "epoch": 1 }, { "type": "loss", "content": 0.02453591488301754, "timestamp": "2025-10-01 04:11:24.584379", "step": 454, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:24.628219", "step": 454, "epoch": 1 }, { "type": "loss", "content": 0.021878505125641823, "timestamp": "2025-10-01 04:11:24.631028", "step": 455, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:24.678347", "step": 455, "epoch": 1 }, { "type": "loss", "content": 0.018454913049936295, "timestamp": "2025-10-01 04:11:24.711161", "step": 456, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:11:26.221328", "step": 456, "epoch": 1 }, { "type": "pplx", "content": 130055133.76006038, "timestamp": "2025-10-01 04:11:26.230991", "step": 456, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:26.272793", "step": 456, "epoch": 1 }, { "type": "loss", "content": 0.022608721628785133, "timestamp": "2025-10-01 04:11:26.282629", "step": 457, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:26.335087", "step": 457, "epoch": 1 }, { "type": "loss", "content": 0.0280974842607975, "timestamp": "2025-10-01 04:11:26.345856", "step": 458, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:26.407223", "step": 458, "epoch": 1 }, { "type": "loss", "content": 0.017616761848330498, "timestamp": "2025-10-01 04:11:26.420275", "step": 459, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:26.479397", "step": 459, "epoch": 1 }, { "type": "loss", "content": 0.0306308064609766, "timestamp": "2025-10-01 04:11:26.511368", "step": 460, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:26.562366", "step": 460, "epoch": 1 }, { "type": "loss", "content": 0.02061159536242485, "timestamp": "2025-10-01 04:11:26.573676", "step": 461, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:26.628336", "step": 461, "epoch": 1 }, { "type": "loss", "content": 0.015530804172158241, "timestamp": "2025-10-01 04:11:26.631710", "step": 462, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:26.679849", "step": 462, "epoch": 1 }, { "type": "loss", "content": 0.011279831640422344, "timestamp": "2025-10-01 04:11:26.690186", "step": 463, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:26.743968", "step": 463, "epoch": 1 }, { "type": "loss", "content": 0.019206982105970383, "timestamp": "2025-10-01 04:11:26.768849", "step": 464, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:26.812608", "step": 464, "epoch": 1 }, { "type": "loss", "content": 0.021167373284697533, "timestamp": "2025-10-01 04:11:26.817101", "step": 465, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:26.876645", "step": 465, "epoch": 1 }, { "type": "loss", "content": 0.017267005518078804, "timestamp": "2025-10-01 04:11:26.891235", "step": 466, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:26.951176", "step": 466, "epoch": 1 }, { "type": "loss", "content": 0.024158697575330734, "timestamp": "2025-10-01 04:11:26.962773", "step": 467, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:27.020341", "step": 467, "epoch": 1 }, { "type": "loss", "content": 0.02584967017173767, "timestamp": "2025-10-01 04:11:27.053211", "step": 468, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:27.107640", "step": 468, "epoch": 1 }, { "type": "loss", "content": 0.02439996600151062, "timestamp": "2025-10-01 04:11:27.118767", "step": 469, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:27.176595", "step": 469, "epoch": 1 }, { "type": "loss", "content": 0.029676523059606552, "timestamp": "2025-10-01 04:11:27.191014", "step": 470, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:27.261187", "step": 470, "epoch": 1 }, { "type": "loss", "content": 0.011198568157851696, "timestamp": "2025-10-01 04:11:27.266160", "step": 471, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:27.317368", "step": 471, "epoch": 1 }, { "type": "loss", "content": 0.018555212765932083, "timestamp": "2025-10-01 04:11:27.353277", "step": 472, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:27.412642", "step": 472, "epoch": 1 }, { "type": "loss", "content": 0.01946217007935047, "timestamp": "2025-10-01 04:11:27.423749", "step": 473, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:27.482034", "step": 473, "epoch": 1 }, { "type": "loss", "content": 0.02705521322786808, "timestamp": "2025-10-01 04:11:27.493430", "step": 474, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:27.545635", "step": 474, "epoch": 1 }, { "type": "loss", "content": 0.015881655737757683, "timestamp": "2025-10-01 04:11:27.551250", "step": 475, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:27.602855", "step": 475, "epoch": 1 }, { "type": "loss", "content": 0.015143660828471184, "timestamp": "2025-10-01 04:11:27.636273", "step": 476, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:27.689037", "step": 476, "epoch": 1 }, { "type": "loss", "content": 0.016437551006674767, "timestamp": "2025-10-01 04:11:27.702284", "step": 477, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:27.766160", "step": 477, "epoch": 1 }, { "type": "loss", "content": 0.024245990440249443, "timestamp": "2025-10-01 04:11:27.778044", "step": 478, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:27.841867", "step": 478, "epoch": 1 }, { "type": "loss", "content": 0.015856830403208733, "timestamp": "2025-10-01 04:11:27.854357", "step": 479, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:27.918230", "step": 479, "epoch": 1 }, { "type": "loss", "content": 0.006872235331684351, "timestamp": "2025-10-01 04:11:27.952875", "step": 480, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:28.010340", "step": 480, "epoch": 1 }, { "type": "loss", "content": 0.01868906430900097, "timestamp": "2025-10-01 04:11:28.023917", "step": 481, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:28.087688", "step": 481, "epoch": 1 }, { "type": "loss", "content": 0.0282533448189497, "timestamp": "2025-10-01 04:11:28.097800", "step": 482, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:28.149950", "step": 482, "epoch": 1 }, { "type": "loss", "content": 0.017546121031045914, "timestamp": "2025-10-01 04:11:28.159246", "step": 483, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:28.204649", "step": 483, "epoch": 1 }, { "type": "loss", "content": 0.010689259506762028, "timestamp": "2025-10-01 04:11:28.236264", "step": 484, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:28.286274", "step": 484, "epoch": 1 }, { "type": "loss", "content": 0.017749961465597153, "timestamp": "2025-10-01 04:11:28.296031", "step": 485, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:28.349267", "step": 485, "epoch": 1 }, { "type": "loss", "content": 0.019139355048537254, "timestamp": "2025-10-01 04:11:28.357242", "step": 486, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:28.409349", "step": 486, "epoch": 1 }, { "type": "loss", "content": 0.035080693662166595, "timestamp": "2025-10-01 04:11:28.416474", "step": 487, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:28.467784", "step": 487, "epoch": 1 }, { "type": "loss", "content": 0.02607973851263523, "timestamp": "2025-10-01 04:11:28.498015", "step": 488, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:28.553625", "step": 488, "epoch": 1 }, { "type": "loss", "content": 0.01762440800666809, "timestamp": "2025-10-01 04:11:28.565257", "step": 489, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:28.618742", "step": 489, "epoch": 1 }, { "type": "loss", "content": 0.015824036672711372, "timestamp": "2025-10-01 04:11:28.629979", "step": 490, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:28.676386", "step": 490, "epoch": 1 }, { "type": "loss", "content": 0.0184563510119915, "timestamp": "2025-10-01 04:11:28.687589", "step": 491, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:28.739797", "step": 491, "epoch": 1 }, { "type": "loss", "content": 0.014332951046526432, "timestamp": "2025-10-01 04:11:28.771525", "step": 492, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:28.823705", "step": 492, "epoch": 1 }, { "type": "loss", "content": 0.013719163835048676, "timestamp": "2025-10-01 04:11:28.831465", "step": 493, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:28.889022", "step": 493, "epoch": 1 }, { "type": "loss", "content": 0.03325566649436951, "timestamp": "2025-10-01 04:11:28.892931", "step": 494, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:28.927265", "step": 494, "epoch": 1 }, { "type": "loss", "content": 0.021148711442947388, "timestamp": "2025-10-01 04:11:28.931412", "step": 495, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:28.974937", "step": 495, "epoch": 1 }, { "type": "loss", "content": 0.02531592920422554, "timestamp": "2025-10-01 04:11:29.005500", "step": 496, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:29.054608", "step": 496, "epoch": 1 }, { "type": "loss", "content": 0.01008320041000843, "timestamp": "2025-10-01 04:11:29.063165", "step": 497, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:29.109695", "step": 497, "epoch": 1 }, { "type": "loss", "content": 0.021790364757180214, "timestamp": "2025-10-01 04:11:29.118405", "step": 498, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:29.174090", "step": 498, "epoch": 1 }, { "type": "loss", "content": 0.02201670967042446, "timestamp": "2025-10-01 04:11:29.182176", "step": 499, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:29.229338", "step": 499, "epoch": 1 }, { "type": "loss", "content": 0.013443304225802422, "timestamp": "2025-10-01 04:11:29.264043", "step": 500, "epoch": 1 }, { "type": "info", "content": "Checkpoint saved at step 500", "timestamp": "2025-10-01 04:11:34.173273", "step": 500, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:34.231112", "step": 500, "epoch": 1 }, { "type": "loss", "content": 0.012881094589829445, "timestamp": "2025-10-01 04:11:34.244068", "step": 501, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:34.306018", "step": 501, "epoch": 1 }, { "type": "loss", "content": 0.026178503409028053, "timestamp": "2025-10-01 04:11:34.311623", "step": 502, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:34.363636", "step": 502, "epoch": 1 }, { "type": "loss", "content": 0.02012251317501068, "timestamp": "2025-10-01 04:11:34.372496", "step": 503, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:34.424270", "step": 503, "epoch": 1 }, { "type": "loss", "content": 0.022417036816477776, "timestamp": "2025-10-01 04:11:34.455470", "step": 504, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:34.500142", "step": 504, "epoch": 1 }, { "type": "loss", "content": 0.0069273607805371284, "timestamp": "2025-10-01 04:11:34.509824", "step": 505, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:34.562129", "step": 505, "epoch": 1 }, { "type": "loss", "content": 0.03598469868302345, "timestamp": "2025-10-01 04:11:34.570327", "step": 506, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:34.619445", "step": 506, "epoch": 1 }, { "type": "loss", "content": 0.02659173123538494, "timestamp": "2025-10-01 04:11:34.623475", "step": 507, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:34.664081", "step": 507, "epoch": 1 }, { "type": "loss", "content": 0.02742273546755314, "timestamp": "2025-10-01 04:11:34.695844", "step": 508, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:34.735857", "step": 508, "epoch": 1 }, { "type": "loss", "content": 0.014712974429130554, "timestamp": "2025-10-01 04:11:34.739445", "step": 509, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:34.782424", "step": 509, "epoch": 1 }, { "type": "loss", "content": 0.012078308500349522, "timestamp": "2025-10-01 04:11:34.786894", "step": 510, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:34.827497", "step": 510, "epoch": 1 }, { "type": "loss", "content": 0.028882889077067375, "timestamp": "2025-10-01 04:11:34.832141", "step": 511, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:34.884158", "step": 511, "epoch": 1 }, { "type": "loss", "content": 0.010723995044827461, "timestamp": "2025-10-01 04:11:34.916416", "step": 512, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:34.968133", "step": 512, "epoch": 1 }, { "type": "loss", "content": 0.03016284666955471, "timestamp": "2025-10-01 04:11:34.976869", "step": 513, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:11:36.604281", "step": 513, "epoch": 1 }, { "type": "pplx", "content": 144971613.22876847, "timestamp": "2025-10-01 04:11:36.616119", "step": 513, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:36.663179", "step": 513, "epoch": 1 }, { "type": "loss", "content": 0.009461253881454468, "timestamp": "2025-10-01 04:11:36.674355", "step": 514, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:36.725412", "step": 514, "epoch": 1 }, { "type": "loss", "content": 0.0260606799274683, "timestamp": "2025-10-01 04:11:36.737766", "step": 515, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:36.792608", "step": 515, "epoch": 1 }, { "type": "loss", "content": 0.011063075624406338, "timestamp": "2025-10-01 04:11:36.820150", "step": 516, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:36.866006", "step": 516, "epoch": 1 }, { "type": "loss", "content": 0.02779548428952694, "timestamp": "2025-10-01 04:11:36.875604", "step": 517, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:36.933550", "step": 517, "epoch": 1 }, { "type": "loss", "content": 0.02797873318195343, "timestamp": "2025-10-01 04:11:36.942350", "step": 518, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:36.986118", "step": 518, "epoch": 1 }, { "type": "loss", "content": 0.030544089153409004, "timestamp": "2025-10-01 04:11:36.990498", "step": 519, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:37.032133", "step": 519, "epoch": 1 }, { "type": "loss", "content": 0.01135441567748785, "timestamp": "2025-10-01 04:11:37.062897", "step": 520, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:37.109278", "step": 520, "epoch": 1 }, { "type": "loss", "content": 0.02531917579472065, "timestamp": "2025-10-01 04:11:37.113661", "step": 521, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:37.159294", "step": 521, "epoch": 1 }, { "type": "loss", "content": 0.030459651723504066, "timestamp": "2025-10-01 04:11:37.163379", "step": 522, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:37.202923", "step": 522, "epoch": 1 }, { "type": "loss", "content": 0.012750692665576935, "timestamp": "2025-10-01 04:11:37.206857", "step": 523, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:37.258203", "step": 523, "epoch": 1 }, { "type": "loss", "content": 0.005090842954814434, "timestamp": "2025-10-01 04:11:37.290681", "step": 524, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:37.338819", "step": 524, "epoch": 1 }, { "type": "loss", "content": 0.013615354895591736, "timestamp": "2025-10-01 04:11:37.352708", "step": 525, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:37.404254", "step": 525, "epoch": 1 }, { "type": "loss", "content": 0.012541785836219788, "timestamp": "2025-10-01 04:11:37.415637", "step": 526, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:37.465835", "step": 526, "epoch": 1 }, { "type": "loss", "content": 0.013080544769763947, "timestamp": "2025-10-01 04:11:37.477060", "step": 527, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:37.526447", "step": 527, "epoch": 1 }, { "type": "loss", "content": 0.015069060027599335, "timestamp": "2025-10-01 04:11:37.558216", "step": 528, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:37.611158", "step": 528, "epoch": 1 }, { "type": "loss", "content": 0.016289498656988144, "timestamp": "2025-10-01 04:11:37.615975", "step": 529, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:37.668954", "step": 529, "epoch": 1 }, { "type": "loss", "content": 0.013955731876194477, "timestamp": "2025-10-01 04:11:37.677874", "step": 530, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:37.729797", "step": 530, "epoch": 1 }, { "type": "loss", "content": 0.025528011843562126, "timestamp": "2025-10-01 04:11:37.742663", "step": 531, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:37.785845", "step": 531, "epoch": 1 }, { "type": "loss", "content": 0.01630363240838051, "timestamp": "2025-10-01 04:11:37.817789", "step": 532, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:37.862583", "step": 532, "epoch": 1 }, { "type": "loss", "content": 0.025512894615530968, "timestamp": "2025-10-01 04:11:37.865298", "step": 533, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:37.918389", "step": 533, "epoch": 1 }, { "type": "loss", "content": 0.030232040211558342, "timestamp": "2025-10-01 04:11:37.927858", "step": 534, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:37.968421", "step": 534, "epoch": 1 }, { "type": "loss", "content": 0.026811758056282997, "timestamp": "2025-10-01 04:11:37.977836", "step": 535, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:38.028632", "step": 535, "epoch": 1 }, { "type": "loss", "content": 0.011201995424926281, "timestamp": "2025-10-01 04:11:38.053083", "step": 536, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:38.092867", "step": 536, "epoch": 1 }, { "type": "loss", "content": 0.03314945474267006, "timestamp": "2025-10-01 04:11:38.099522", "step": 537, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:38.142433", "step": 537, "epoch": 1 }, { "type": "loss", "content": 0.014284268952906132, "timestamp": "2025-10-01 04:11:38.145838", "step": 538, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:38.186400", "step": 538, "epoch": 1 }, { "type": "loss", "content": 0.04397100955247879, "timestamp": "2025-10-01 04:11:38.193458", "step": 539, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:38.237954", "step": 539, "epoch": 1 }, { "type": "loss", "content": 0.021980592980980873, "timestamp": "2025-10-01 04:11:38.265054", "step": 540, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:38.311808", "step": 540, "epoch": 1 }, { "type": "loss", "content": 0.03475477546453476, "timestamp": "2025-10-01 04:11:38.320167", "step": 541, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:38.359102", "step": 541, "epoch": 1 }, { "type": "loss", "content": 0.019862128421664238, "timestamp": "2025-10-01 04:11:38.368213", "step": 542, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:38.406858", "step": 542, "epoch": 1 }, { "type": "loss", "content": 0.01110259909182787, "timestamp": "2025-10-01 04:11:38.409661", "step": 543, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:38.448788", "step": 543, "epoch": 1 }, { "type": "loss", "content": 0.015918336808681488, "timestamp": "2025-10-01 04:11:38.474360", "step": 544, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:38.512986", "step": 544, "epoch": 1 }, { "type": "loss", "content": 0.02247166447341442, "timestamp": "2025-10-01 04:11:38.516924", "step": 545, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:38.553224", "step": 545, "epoch": 1 }, { "type": "loss", "content": 0.02808975800871849, "timestamp": "2025-10-01 04:11:38.557013", "step": 546, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:38.592106", "step": 546, "epoch": 1 }, { "type": "loss", "content": 0.03257390111684799, "timestamp": "2025-10-01 04:11:38.595029", "step": 547, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:38.631955", "step": 547, "epoch": 1 }, { "type": "loss", "content": 0.020895110443234444, "timestamp": "2025-10-01 04:11:38.657815", "step": 548, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:38.692790", "step": 548, "epoch": 1 }, { "type": "loss", "content": 0.03609849140048027, "timestamp": "2025-10-01 04:11:38.698190", "step": 549, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:38.734437", "step": 549, "epoch": 1 }, { "type": "loss", "content": 0.03249197080731392, "timestamp": "2025-10-01 04:11:38.738953", "step": 550, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:38.775635", "step": 550, "epoch": 1 }, { "type": "loss", "content": 0.02835281565785408, "timestamp": "2025-10-01 04:11:38.782143", "step": 551, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:38.827829", "step": 551, "epoch": 1 }, { "type": "loss", "content": 0.03946463018655777, "timestamp": "2025-10-01 04:11:38.856539", "step": 552, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:38.915006", "step": 552, "epoch": 1 }, { "type": "loss", "content": 0.020388005301356316, "timestamp": "2025-10-01 04:11:38.919534", "step": 553, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:38.965187", "step": 553, "epoch": 1 }, { "type": "loss", "content": 0.02258807048201561, "timestamp": "2025-10-01 04:11:38.975571", "step": 554, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:39.035104", "step": 554, "epoch": 1 }, { "type": "loss", "content": 0.01330896932631731, "timestamp": "2025-10-01 04:11:39.041654", "step": 555, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:39.097170", "step": 555, "epoch": 1 }, { "type": "loss", "content": 0.026032237336039543, "timestamp": "2025-10-01 04:11:39.122580", "step": 556, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:39.173247", "step": 556, "epoch": 1 }, { "type": "loss", "content": 0.023419994860887527, "timestamp": "2025-10-01 04:11:39.184804", "step": 557, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:39.232931", "step": 557, "epoch": 1 }, { "type": "loss", "content": 0.016865305602550507, "timestamp": "2025-10-01 04:11:39.240914", "step": 558, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:39.283719", "step": 558, "epoch": 1 }, { "type": "loss", "content": 0.018437618389725685, "timestamp": "2025-10-01 04:11:39.294422", "step": 559, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:39.347342", "step": 559, "epoch": 1 }, { "type": "loss", "content": 0.01190783642232418, "timestamp": "2025-10-01 04:11:39.375401", "step": 560, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:39.424333", "step": 560, "epoch": 1 }, { "type": "loss", "content": 0.01879892125725746, "timestamp": "2025-10-01 04:11:39.427746", "step": 561, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:39.490886", "step": 561, "epoch": 1 }, { "type": "loss", "content": 0.014638060703873634, "timestamp": "2025-10-01 04:11:39.503826", "step": 562, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:39.559223", "step": 562, "epoch": 1 }, { "type": "loss", "content": 0.015171224251389503, "timestamp": "2025-10-01 04:11:39.571749", "step": 563, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:39.615752", "step": 563, "epoch": 1 }, { "type": "loss", "content": 0.030538225546479225, "timestamp": "2025-10-01 04:11:39.644445", "step": 564, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:39.697071", "step": 564, "epoch": 1 }, { "type": "loss", "content": 0.02481764741241932, "timestamp": "2025-10-01 04:11:39.699689", "step": 565, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:39.741554", "step": 565, "epoch": 1 }, { "type": "loss", "content": 0.021298576146364212, "timestamp": "2025-10-01 04:11:39.746909", "step": 566, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:39.788689", "step": 566, "epoch": 1 }, { "type": "loss", "content": 0.015441848896443844, "timestamp": "2025-10-01 04:11:39.794697", "step": 567, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:11:39.833843", "step": 567, "epoch": 1 }, { "type": "loss", "content": 0.023940859362483025, "timestamp": "2025-10-01 04:11:39.861606", "step": 568, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:39.902005", "step": 568, "epoch": 1 }, { "type": "loss", "content": 0.030663734301924706, "timestamp": "2025-10-01 04:11:39.908904", "step": 569, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:39.952564", "step": 569, "epoch": 1 }, { "type": "loss", "content": 0.01633959636092186, "timestamp": "2025-10-01 04:11:39.958638", "step": 570, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:11:41.224592", "step": 570, "epoch": 1 }, { "type": "pplx", "content": 146728730.95887718, "timestamp": "2025-10-01 04:11:41.228038", "step": 570, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:41.259141", "step": 570, "epoch": 1 }, { "type": "loss", "content": 0.014267205260694027, "timestamp": "2025-10-01 04:11:41.263819", "step": 571, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:41.300716", "step": 571, "epoch": 1 }, { "type": "loss", "content": 0.017848124727606773, "timestamp": "2025-10-01 04:11:41.331862", "step": 572, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:41.385110", "step": 572, "epoch": 1 }, { "type": "loss", "content": 0.029249539598822594, "timestamp": "2025-10-01 04:11:41.392861", "step": 573, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:41.446810", "step": 573, "epoch": 1 }, { "type": "loss", "content": 0.019859248772263527, "timestamp": "2025-10-01 04:11:41.450320", "step": 574, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:41.498695", "step": 574, "epoch": 1 }, { "type": "loss", "content": 0.014449072070419788, "timestamp": "2025-10-01 04:11:41.506288", "step": 575, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:41.556632", "step": 575, "epoch": 1 }, { "type": "loss", "content": 0.00982545968145132, "timestamp": "2025-10-01 04:11:41.591449", "step": 576, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:41.636913", "step": 576, "epoch": 1 }, { "type": "loss", "content": 0.018064500764012337, "timestamp": "2025-10-01 04:11:41.648203", "step": 577, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:11:41.702802", "step": 577, "epoch": 1 }, { "type": "loss", "content": 0.027665166184306145, "timestamp": "2025-10-01 04:11:41.712337", "step": 578, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:41.757805", "step": 578, "epoch": 1 }, { "type": "loss", "content": 0.013823381625115871, "timestamp": "2025-10-01 04:11:41.765391", "step": 579, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:41.811314", "step": 579, "epoch": 1 }, { "type": "loss", "content": 0.01081685908138752, "timestamp": "2025-10-01 04:11:41.842087", "step": 580, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:41.888817", "step": 580, "epoch": 1 }, { "type": "loss", "content": 0.012843619100749493, "timestamp": "2025-10-01 04:11:41.892572", "step": 581, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:41.926267", "step": 581, "epoch": 1 }, { "type": "loss", "content": 0.004551833029836416, "timestamp": "2025-10-01 04:11:41.930554", "step": 582, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:41.978378", "step": 582, "epoch": 1 }, { "type": "loss", "content": 0.022612929344177246, "timestamp": "2025-10-01 04:11:41.989583", "step": 583, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:42.034695", "step": 583, "epoch": 1 }, { "type": "loss", "content": 0.018055619671940804, "timestamp": "2025-10-01 04:11:42.067852", "step": 584, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:42.123287", "step": 584, "epoch": 1 }, { "type": "loss", "content": 0.0163103099912405, "timestamp": "2025-10-01 04:11:42.128278", "step": 585, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:42.173776", "step": 585, "epoch": 1 }, { "type": "loss", "content": 0.003515422809869051, "timestamp": "2025-10-01 04:11:42.177904", "step": 586, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:42.231181", "step": 586, "epoch": 1 }, { "type": "loss", "content": 0.002217690460383892, "timestamp": "2025-10-01 04:11:42.235150", "step": 587, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:42.279365", "step": 587, "epoch": 1 }, { "type": "loss", "content": 0.048120323568582535, "timestamp": "2025-10-01 04:11:42.303808", "step": 588, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:42.344496", "step": 588, "epoch": 1 }, { "type": "loss", "content": 0.03937225788831711, "timestamp": "2025-10-01 04:11:42.350980", "step": 589, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:42.394800", "step": 589, "epoch": 1 }, { "type": "loss", "content": 0.026554737240076065, "timestamp": "2025-10-01 04:11:42.401055", "step": 590, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-10-01 04:11:42.456468", "step": 590, "epoch": 1 }, { "type": "loss", "content": 0.03893747180700302, "timestamp": "2025-10-01 04:11:42.461524", "step": 591, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:42.498399", "step": 591, "epoch": 1 }, { "type": "loss", "content": 0.016101589426398277, "timestamp": "2025-10-01 04:11:42.522332", "step": 592, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:42.561069", "step": 592, "epoch": 1 }, { "type": "loss", "content": 0.024273639544844627, "timestamp": "2025-10-01 04:11:42.566142", "step": 593, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:42.603836", "step": 593, "epoch": 1 }, { "type": "loss", "content": 0.005769102834165096, "timestamp": "2025-10-01 04:11:42.606126", "step": 594, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:42.643810", "step": 594, "epoch": 1 }, { "type": "loss", "content": 0.017685530707240105, "timestamp": "2025-10-01 04:11:42.648496", "step": 595, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:42.689213", "step": 595, "epoch": 1 }, { "type": "loss", "content": 0.03208998963236809, "timestamp": "2025-10-01 04:11:42.715515", "step": 596, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:42.759915", "step": 596, "epoch": 1 }, { "type": "loss", "content": 0.018433064222335815, "timestamp": "2025-10-01 04:11:42.767772", "step": 597, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:42.804217", "step": 597, "epoch": 1 }, { "type": "loss", "content": 0.019542574882507324, "timestamp": "2025-10-01 04:11:42.810714", "step": 598, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:42.845310", "step": 598, "epoch": 1 }, { "type": "loss", "content": 0.019781693816184998, "timestamp": "2025-10-01 04:11:42.849144", "step": 599, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:42.887017", "step": 599, "epoch": 1 }, { "type": "loss", "content": 0.030181298032402992, "timestamp": "2025-10-01 04:11:42.918198", "step": 600, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:42.956952", "step": 600, "epoch": 1 }, { "type": "loss", "content": 0.04341671243309975, "timestamp": "2025-10-01 04:11:42.966349", "step": 601, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:43.008213", "step": 601, "epoch": 1 }, { "type": "loss", "content": 0.019647929817438126, "timestamp": "2025-10-01 04:11:43.011942", "step": 602, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:43.061044", "step": 602, "epoch": 1 }, { "type": "loss", "content": 0.03288035839796066, "timestamp": "2025-10-01 04:11:43.068800", "step": 603, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:43.114934", "step": 603, "epoch": 1 }, { "type": "loss", "content": 0.012869777157902718, "timestamp": "2025-10-01 04:11:43.140205", "step": 604, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:43.190453", "step": 604, "epoch": 1 }, { "type": "loss", "content": 0.014938059262931347, "timestamp": "2025-10-01 04:11:43.197875", "step": 605, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:43.243030", "step": 605, "epoch": 1 }, { "type": "loss", "content": 0.020758474245667458, "timestamp": "2025-10-01 04:11:43.254294", "step": 606, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:43.313300", "step": 606, "epoch": 1 }, { "type": "loss", "content": 0.023509101942181587, "timestamp": "2025-10-01 04:11:43.325382", "step": 607, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:43.379107", "step": 607, "epoch": 1 }, { "type": "loss", "content": 0.012371575459837914, "timestamp": "2025-10-01 04:11:43.404940", "step": 608, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:43.457173", "step": 608, "epoch": 1 }, { "type": "loss", "content": 0.013531157746911049, "timestamp": "2025-10-01 04:11:43.460037", "step": 609, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:43.509484", "step": 609, "epoch": 1 }, { "type": "loss", "content": 0.009105498902499676, "timestamp": "2025-10-01 04:11:43.513600", "step": 610, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:43.577249", "step": 610, "epoch": 1 }, { "type": "loss", "content": 0.029772654175758362, "timestamp": "2025-10-01 04:11:43.580799", "step": 611, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:43.625139", "step": 611, "epoch": 1 }, { "type": "loss", "content": 0.0325927771627903, "timestamp": "2025-10-01 04:11:43.655460", "step": 612, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:43.712267", "step": 612, "epoch": 1 }, { "type": "loss", "content": 0.01552466582506895, "timestamp": "2025-10-01 04:11:43.715922", "step": 613, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:43.768814", "step": 613, "epoch": 1 }, { "type": "loss", "content": 0.019661977887153625, "timestamp": "2025-10-01 04:11:43.778483", "step": 614, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:43.824321", "step": 614, "epoch": 1 }, { "type": "loss", "content": 0.040737394243478775, "timestamp": "2025-10-01 04:11:43.834053", "step": 615, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:43.881033", "step": 615, "epoch": 1 }, { "type": "loss", "content": 0.010232476517558098, "timestamp": "2025-10-01 04:11:43.909598", "step": 616, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:43.951807", "step": 616, "epoch": 1 }, { "type": "loss", "content": 0.030402900651097298, "timestamp": "2025-10-01 04:11:43.956166", "step": 617, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:43.994275", "step": 617, "epoch": 1 }, { "type": "loss", "content": 0.013743719086050987, "timestamp": "2025-10-01 04:11:43.999529", "step": 618, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:44.046112", "step": 618, "epoch": 1 }, { "type": "loss", "content": 0.025667309761047363, "timestamp": "2025-10-01 04:11:44.050403", "step": 619, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:44.087058", "step": 619, "epoch": 1 }, { "type": "loss", "content": 0.008081094361841679, "timestamp": "2025-10-01 04:11:44.113099", "step": 620, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:44.150732", "step": 620, "epoch": 1 }, { "type": "loss", "content": 0.0131992744281888, "timestamp": "2025-10-01 04:11:44.155427", "step": 621, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:44.193956", "step": 621, "epoch": 1 }, { "type": "loss", "content": 0.0235703457146883, "timestamp": "2025-10-01 04:11:44.198410", "step": 622, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:44.234797", "step": 622, "epoch": 1 }, { "type": "loss", "content": 0.0194076057523489, "timestamp": "2025-10-01 04:11:44.242420", "step": 623, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:44.293021", "step": 623, "epoch": 1 }, { "type": "loss", "content": 0.053576499223709106, "timestamp": "2025-10-01 04:11:44.322495", "step": 624, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:44.366256", "step": 624, "epoch": 1 }, { "type": "loss", "content": 0.0185772143304348, "timestamp": "2025-10-01 04:11:44.369297", "step": 625, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:44.409271", "step": 625, "epoch": 1 }, { "type": "loss", "content": 0.01065006759017706, "timestamp": "2025-10-01 04:11:44.415553", "step": 626, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:44.458647", "step": 626, "epoch": 1 }, { "type": "loss", "content": 0.00875663198530674, "timestamp": "2025-10-01 04:11:44.461503", "step": 627, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:11:45.699108", "step": 627, "epoch": 1 }, { "type": "pplx", "content": 127842067.85324596, "timestamp": "2025-10-01 04:11:45.704120", "step": 627, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:45.748908", "step": 627, "epoch": 1 }, { "type": "loss", "content": 0.00792365800589323, "timestamp": "2025-10-01 04:11:45.774658", "step": 628, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:45.826805", "step": 628, "epoch": 1 }, { "type": "loss", "content": 0.039805445820093155, "timestamp": "2025-10-01 04:11:45.837193", "step": 629, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:45.884717", "step": 629, "epoch": 1 }, { "type": "loss", "content": 0.0297335684299469, "timestamp": "2025-10-01 04:11:45.888021", "step": 630, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:45.926800", "step": 630, "epoch": 1 }, { "type": "loss", "content": 0.017810344696044922, "timestamp": "2025-10-01 04:11:45.929150", "step": 631, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:45.962978", "step": 631, "epoch": 1 }, { "type": "loss", "content": 0.018673760816454887, "timestamp": "2025-10-01 04:11:45.987846", "step": 632, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:46.038102", "step": 632, "epoch": 1 }, { "type": "loss", "content": 0.009931197389960289, "timestamp": "2025-10-01 04:11:46.040828", "step": 633, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:46.084540", "step": 633, "epoch": 1 }, { "type": "loss", "content": 0.012832448817789555, "timestamp": "2025-10-01 04:11:46.093694", "step": 634, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:46.142397", "step": 634, "epoch": 1 }, { "type": "loss", "content": 0.02591419219970703, "timestamp": "2025-10-01 04:11:46.151771", "step": 635, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:46.194918", "step": 635, "epoch": 1 }, { "type": "loss", "content": 0.013405759818851948, "timestamp": "2025-10-01 04:11:46.222594", "step": 636, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:46.272078", "step": 636, "epoch": 1 }, { "type": "loss", "content": 0.008755778893828392, "timestamp": "2025-10-01 04:11:46.283053", "step": 637, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:46.343153", "step": 637, "epoch": 1 }, { "type": "loss", "content": 0.022010071203112602, "timestamp": "2025-10-01 04:11:46.348247", "step": 638, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:46.387259", "step": 638, "epoch": 1 }, { "type": "loss", "content": 0.028203662484884262, "timestamp": "2025-10-01 04:11:46.396811", "step": 639, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:46.452951", "step": 639, "epoch": 1 }, { "type": "loss", "content": 0.00725026847794652, "timestamp": "2025-10-01 04:11:46.485321", "step": 640, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:46.528735", "step": 640, "epoch": 1 }, { "type": "loss", "content": 0.011902433820068836, "timestamp": "2025-10-01 04:11:46.533869", "step": 641, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:46.582609", "step": 641, "epoch": 1 }, { "type": "loss", "content": 0.01911485567688942, "timestamp": "2025-10-01 04:11:46.593115", "step": 642, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:46.652808", "step": 642, "epoch": 1 }, { "type": "loss", "content": 0.014642453752458096, "timestamp": "2025-10-01 04:11:46.663324", "step": 643, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:46.716292", "step": 643, "epoch": 1 }, { "type": "loss", "content": 0.030542129650712013, "timestamp": "2025-10-01 04:11:46.747460", "step": 644, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:46.791680", "step": 644, "epoch": 1 }, { "type": "loss", "content": 0.013660822995007038, "timestamp": "2025-10-01 04:11:46.795017", "step": 645, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:46.837949", "step": 645, "epoch": 1 }, { "type": "loss", "content": 0.028145968914031982, "timestamp": "2025-10-01 04:11:46.850772", "step": 646, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:46.914406", "step": 646, "epoch": 1 }, { "type": "loss", "content": 0.014101878739893436, "timestamp": "2025-10-01 04:11:46.918770", "step": 647, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:46.976185", "step": 647, "epoch": 1 }, { "type": "loss", "content": 0.012868748977780342, "timestamp": "2025-10-01 04:11:47.008560", "step": 648, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:47.062657", "step": 648, "epoch": 1 }, { "type": "loss", "content": 0.013591781258583069, "timestamp": "2025-10-01 04:11:47.075665", "step": 649, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:47.135458", "step": 649, "epoch": 1 }, { "type": "loss", "content": 0.010220631957054138, "timestamp": "2025-10-01 04:11:47.140479", "step": 650, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:47.205954", "step": 650, "epoch": 1 }, { "type": "loss", "content": 0.017303917557001114, "timestamp": "2025-10-01 04:11:47.219650", "step": 651, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:47.271695", "step": 651, "epoch": 1 }, { "type": "loss", "content": 0.014156176708638668, "timestamp": "2025-10-01 04:11:47.303711", "step": 652, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:47.351934", "step": 652, "epoch": 1 }, { "type": "loss", "content": 0.019841428846120834, "timestamp": "2025-10-01 04:11:47.359327", "step": 653, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:47.403875", "step": 653, "epoch": 1 }, { "type": "loss", "content": 0.02891084924340248, "timestamp": "2025-10-01 04:11:47.411774", "step": 654, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:47.460795", "step": 654, "epoch": 1 }, { "type": "loss", "content": 0.01000217255204916, "timestamp": "2025-10-01 04:11:47.469112", "step": 655, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:47.522327", "step": 655, "epoch": 1 }, { "type": "loss", "content": 0.007814330980181694, "timestamp": "2025-10-01 04:11:47.549062", "step": 656, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:47.605789", "step": 656, "epoch": 1 }, { "type": "loss", "content": 0.004295154474675655, "timestamp": "2025-10-01 04:11:47.609624", "step": 657, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:47.660242", "step": 657, "epoch": 1 }, { "type": "loss", "content": 0.021786142140626907, "timestamp": "2025-10-01 04:11:47.665898", "step": 658, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:47.722907", "step": 658, "epoch": 1 }, { "type": "loss", "content": 0.006306357216089964, "timestamp": "2025-10-01 04:11:47.727506", "step": 659, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:47.787277", "step": 659, "epoch": 1 }, { "type": "loss", "content": 0.012746252119541168, "timestamp": "2025-10-01 04:11:47.812729", "step": 660, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:47.865566", "step": 660, "epoch": 1 }, { "type": "loss", "content": 0.010227000340819359, "timestamp": "2025-10-01 04:11:47.870772", "step": 661, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:47.909963", "step": 661, "epoch": 1 }, { "type": "loss", "content": 0.00480778981000185, "timestamp": "2025-10-01 04:11:47.922171", "step": 662, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:47.986343", "step": 662, "epoch": 1 }, { "type": "loss", "content": 0.01734638772904873, "timestamp": "2025-10-01 04:11:47.998855", "step": 663, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:48.046199", "step": 663, "epoch": 1 }, { "type": "loss", "content": 0.017665620893239975, "timestamp": "2025-10-01 04:11:48.080409", "step": 664, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:48.132670", "step": 664, "epoch": 1 }, { "type": "loss", "content": 0.009129605256021023, "timestamp": "2025-10-01 04:11:48.136471", "step": 665, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:48.183725", "step": 665, "epoch": 1 }, { "type": "loss", "content": 0.0035495725460350513, "timestamp": "2025-10-01 04:11:48.188771", "step": 666, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:48.242315", "step": 666, "epoch": 1 }, { "type": "loss", "content": 0.008134677074849606, "timestamp": "2025-10-01 04:11:48.245520", "step": 667, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:48.293132", "step": 667, "epoch": 1 }, { "type": "loss", "content": 0.00795725453644991, "timestamp": "2025-10-01 04:11:48.329553", "step": 668, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:48.380905", "step": 668, "epoch": 1 }, { "type": "loss", "content": 0.053504396229982376, "timestamp": "2025-10-01 04:11:48.385393", "step": 669, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:48.435741", "step": 669, "epoch": 1 }, { "type": "loss", "content": 0.05954793840646744, "timestamp": "2025-10-01 04:11:48.447124", "step": 670, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:48.504910", "step": 670, "epoch": 1 }, { "type": "loss", "content": 0.008816796354949474, "timestamp": "2025-10-01 04:11:48.509743", "step": 671, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:48.560990", "step": 671, "epoch": 1 }, { "type": "loss", "content": 0.017331920564174652, "timestamp": "2025-10-01 04:11:48.591951", "step": 672, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:48.642732", "step": 672, "epoch": 1 }, { "type": "loss", "content": 0.02753291465342045, "timestamp": "2025-10-01 04:11:48.652747", "step": 673, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:48.691722", "step": 673, "epoch": 1 }, { "type": "loss", "content": 0.029163358733057976, "timestamp": "2025-10-01 04:11:48.700026", "step": 674, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:48.745563", "step": 674, "epoch": 1 }, { "type": "loss", "content": 0.05267322063446045, "timestamp": "2025-10-01 04:11:48.749392", "step": 675, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:48.801086", "step": 675, "epoch": 1 }, { "type": "loss", "content": 0.01871684566140175, "timestamp": "2025-10-01 04:11:48.827543", "step": 676, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:48.878793", "step": 676, "epoch": 1 }, { "type": "loss", "content": 0.025403184816241264, "timestamp": "2025-10-01 04:11:48.882640", "step": 677, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:48.944070", "step": 677, "epoch": 1 }, { "type": "loss", "content": 0.006062587257474661, "timestamp": "2025-10-01 04:11:48.959017", "step": 678, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:49.009924", "step": 678, "epoch": 1 }, { "type": "loss", "content": 0.005435396917164326, "timestamp": "2025-10-01 04:11:49.019398", "step": 679, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:49.066613", "step": 679, "epoch": 1 }, { "type": "loss", "content": 0.017074231058359146, "timestamp": "2025-10-01 04:11:49.100097", "step": 680, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:49.158526", "step": 680, "epoch": 1 }, { "type": "loss", "content": 0.011236979626119137, "timestamp": "2025-10-01 04:11:49.170680", "step": 681, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:49.220550", "step": 681, "epoch": 1 }, { "type": "loss", "content": 0.01346462219953537, "timestamp": "2025-10-01 04:11:49.229964", "step": 682, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:49.287818", "step": 682, "epoch": 1 }, { "type": "loss", "content": 0.01821001246571541, "timestamp": "2025-10-01 04:11:49.298681", "step": 683, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:49.358282", "step": 683, "epoch": 1 }, { "type": "loss", "content": 0.010340625420212746, "timestamp": "2025-10-01 04:11:49.390650", "step": 684, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:11:50.690899", "step": 684, "epoch": 1 }, { "type": "pplx", "content": 119262264.46404168, "timestamp": "2025-10-01 04:11:50.699070", "step": 684, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:50.736407", "step": 684, "epoch": 1 }, { "type": "loss", "content": 0.03107767179608345, "timestamp": "2025-10-01 04:11:50.739416", "step": 685, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:50.778891", "step": 685, "epoch": 1 }, { "type": "loss", "content": 0.016809258610010147, "timestamp": "2025-10-01 04:11:50.784183", "step": 686, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:50.821329", "step": 686, "epoch": 1 }, { "type": "loss", "content": 0.007657169364392757, "timestamp": "2025-10-01 04:11:50.823773", "step": 687, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:50.858616", "step": 687, "epoch": 1 }, { "type": "loss", "content": 0.009451446123421192, "timestamp": "2025-10-01 04:11:50.886751", "step": 688, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:50.930086", "step": 688, "epoch": 1 }, { "type": "loss", "content": 0.036196958273649216, "timestamp": "2025-10-01 04:11:50.935044", "step": 689, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:50.973134", "step": 689, "epoch": 1 }, { "type": "loss", "content": 0.011105935089290142, "timestamp": "2025-10-01 04:11:50.975638", "step": 690, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:51.015717", "step": 690, "epoch": 1 }, { "type": "loss", "content": 0.00982974749058485, "timestamp": "2025-10-01 04:11:51.019017", "step": 691, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:51.059996", "step": 691, "epoch": 1 }, { "type": "loss", "content": 0.01059667021036148, "timestamp": "2025-10-01 04:11:51.087081", "step": 692, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:51.119460", "step": 692, "epoch": 1 }, { "type": "loss", "content": 0.01886533759534359, "timestamp": "2025-10-01 04:11:51.124016", "step": 693, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:51.159842", "step": 693, "epoch": 1 }, { "type": "loss", "content": 0.0357852578163147, "timestamp": "2025-10-01 04:11:51.165771", "step": 694, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:51.209301", "step": 694, "epoch": 1 }, { "type": "loss", "content": 0.007002557162195444, "timestamp": "2025-10-01 04:11:51.216887", "step": 695, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:51.260131", "step": 695, "epoch": 1 }, { "type": "loss", "content": 0.029862623661756516, "timestamp": "2025-10-01 04:11:51.288062", "step": 696, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:51.326956", "step": 696, "epoch": 1 }, { "type": "loss", "content": 0.016306012868881226, "timestamp": "2025-10-01 04:11:51.331337", "step": 697, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:51.366272", "step": 697, "epoch": 1 }, { "type": "loss", "content": 0.015408610925078392, "timestamp": "2025-10-01 04:11:51.374520", "step": 698, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:51.417648", "step": 698, "epoch": 1 }, { "type": "loss", "content": 0.025938164442777634, "timestamp": "2025-10-01 04:11:51.421588", "step": 699, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:51.471695", "step": 699, "epoch": 1 }, { "type": "loss", "content": 0.023498162627220154, "timestamp": "2025-10-01 04:11:51.502819", "step": 700, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:51.553242", "step": 700, "epoch": 1 }, { "type": "loss", "content": 0.009221481159329414, "timestamp": "2025-10-01 04:11:51.556404", "step": 701, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:51.594378", "step": 701, "epoch": 1 }, { "type": "loss", "content": 0.02343042753636837, "timestamp": "2025-10-01 04:11:51.596942", "step": 702, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:51.636919", "step": 702, "epoch": 1 }, { "type": "loss", "content": 0.015130646526813507, "timestamp": "2025-10-01 04:11:51.643347", "step": 703, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:51.692594", "step": 703, "epoch": 1 }, { "type": "loss", "content": 0.013389564119279385, "timestamp": "2025-10-01 04:11:51.721853", "step": 704, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:51.775838", "step": 704, "epoch": 1 }, { "type": "loss", "content": 0.019185690209269524, "timestamp": "2025-10-01 04:11:51.779289", "step": 705, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:51.814170", "step": 705, "epoch": 1 }, { "type": "loss", "content": 0.006280092056840658, "timestamp": "2025-10-01 04:11:51.817534", "step": 706, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:51.872723", "step": 706, "epoch": 1 }, { "type": "loss", "content": 0.006022441666573286, "timestamp": "2025-10-01 04:11:51.881486", "step": 707, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:51.925097", "step": 707, "epoch": 1 }, { "type": "loss", "content": 0.014056635089218616, "timestamp": "2025-10-01 04:11:51.951983", "step": 708, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:51.995071", "step": 708, "epoch": 1 }, { "type": "loss", "content": 0.014901148155331612, "timestamp": "2025-10-01 04:11:52.000334", "step": 709, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:52.040164", "step": 709, "epoch": 1 }, { "type": "loss", "content": 0.044664185494184494, "timestamp": "2025-10-01 04:11:52.045037", "step": 710, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:11:52.085584", "step": 710, "epoch": 1 }, { "type": "loss", "content": 0.02443745732307434, "timestamp": "2025-10-01 04:11:52.090731", "step": 711, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:52.131077", "step": 711, "epoch": 1 }, { "type": "loss", "content": 0.03858523443341255, "timestamp": "2025-10-01 04:11:52.156477", "step": 712, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:52.191690", "step": 712, "epoch": 1 }, { "type": "loss", "content": 0.016447898000478745, "timestamp": "2025-10-01 04:11:52.197775", "step": 713, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:52.239379", "step": 713, "epoch": 1 }, { "type": "loss", "content": 0.010604329407215118, "timestamp": "2025-10-01 04:11:52.243288", "step": 714, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:52.278282", "step": 714, "epoch": 1 }, { "type": "loss", "content": 0.029538193717598915, "timestamp": "2025-10-01 04:11:52.283351", "step": 715, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:52.319336", "step": 715, "epoch": 1 }, { "type": "loss", "content": 0.040076810866594315, "timestamp": "2025-10-01 04:11:52.346855", "step": 716, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:52.389234", "step": 716, "epoch": 1 }, { "type": "loss", "content": 0.020004551857709885, "timestamp": "2025-10-01 04:11:52.391875", "step": 717, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:52.428625", "step": 717, "epoch": 1 }, { "type": "loss", "content": 0.016628902405500412, "timestamp": "2025-10-01 04:11:52.433795", "step": 718, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:52.473938", "step": 718, "epoch": 1 }, { "type": "loss", "content": 0.012509877793490887, "timestamp": "2025-10-01 04:11:52.476953", "step": 719, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:52.522188", "step": 719, "epoch": 1 }, { "type": "loss", "content": 0.031048806384205818, "timestamp": "2025-10-01 04:11:52.550284", "step": 720, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:52.594671", "step": 720, "epoch": 1 }, { "type": "loss", "content": 0.005084963981062174, "timestamp": "2025-10-01 04:11:52.597221", "step": 721, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:52.638860", "step": 721, "epoch": 1 }, { "type": "loss", "content": 0.004975512158125639, "timestamp": "2025-10-01 04:11:52.646864", "step": 722, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:52.686807", "step": 722, "epoch": 1 }, { "type": "loss", "content": 0.012422831729054451, "timestamp": "2025-10-01 04:11:52.696729", "step": 723, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:52.742894", "step": 723, "epoch": 1 }, { "type": "loss", "content": 0.02685965597629547, "timestamp": "2025-10-01 04:11:52.770524", "step": 724, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:52.810267", "step": 724, "epoch": 1 }, { "type": "loss", "content": 0.038959138095378876, "timestamp": "2025-10-01 04:11:52.815399", "step": 725, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:52.855285", "step": 725, "epoch": 1 }, { "type": "loss", "content": 0.021698709577322006, "timestamp": "2025-10-01 04:11:52.858958", "step": 726, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:52.895979", "step": 726, "epoch": 1 }, { "type": "loss", "content": 0.016632195562124252, "timestamp": "2025-10-01 04:11:52.901026", "step": 727, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:52.941132", "step": 727, "epoch": 1 }, { "type": "loss", "content": 0.009156654588878155, "timestamp": "2025-10-01 04:11:52.967868", "step": 728, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:53.001799", "step": 728, "epoch": 1 }, { "type": "loss", "content": 0.009549522772431374, "timestamp": "2025-10-01 04:11:53.006254", "step": 729, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:53.041607", "step": 729, "epoch": 1 }, { "type": "loss", "content": 0.03153882548213005, "timestamp": "2025-10-01 04:11:53.050118", "step": 730, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:53.091813", "step": 730, "epoch": 1 }, { "type": "loss", "content": 0.008350717835128307, "timestamp": "2025-10-01 04:11:53.094476", "step": 731, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:53.139106", "step": 731, "epoch": 1 }, { "type": "loss", "content": 0.02463681623339653, "timestamp": "2025-10-01 04:11:53.169284", "step": 732, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:53.212469", "step": 732, "epoch": 1 }, { "type": "loss", "content": 0.019675912335515022, "timestamp": "2025-10-01 04:11:53.220057", "step": 733, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:53.256920", "step": 733, "epoch": 1 }, { "type": "loss", "content": 0.01801421120762825, "timestamp": "2025-10-01 04:11:53.267495", "step": 734, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:53.327589", "step": 734, "epoch": 1 }, { "type": "loss", "content": 0.015794653445482254, "timestamp": "2025-10-01 04:11:53.335394", "step": 735, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:53.374717", "step": 735, "epoch": 1 }, { "type": "loss", "content": 0.010897762142121792, "timestamp": "2025-10-01 04:11:53.405601", "step": 736, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:53.445501", "step": 736, "epoch": 1 }, { "type": "loss", "content": 0.02856982871890068, "timestamp": "2025-10-01 04:11:53.454843", "step": 737, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:53.496780", "step": 737, "epoch": 1 }, { "type": "loss", "content": 0.014090476557612419, "timestamp": "2025-10-01 04:11:53.500010", "step": 738, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:53.563469", "step": 738, "epoch": 1 }, { "type": "loss", "content": 0.022237354889512062, "timestamp": "2025-10-01 04:11:53.573453", "step": 739, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:53.627816", "step": 739, "epoch": 1 }, { "type": "loss", "content": 0.0287479180842638, "timestamp": "2025-10-01 04:11:53.661257", "step": 740, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:53.714096", "step": 740, "epoch": 1 }, { "type": "loss", "content": 0.028681013733148575, "timestamp": "2025-10-01 04:11:53.724233", "step": 741, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:11:54.833457", "step": 741, "epoch": 1 }, { "type": "pplx", "content": 108961684.90475054, "timestamp": "2025-10-01 04:11:54.839330", "step": 741, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:54.872578", "step": 741, "epoch": 1 }, { "type": "loss", "content": 0.013316790573298931, "timestamp": "2025-10-01 04:11:54.875134", "step": 742, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:54.914749", "step": 742, "epoch": 1 }, { "type": "loss", "content": 0.015716271474957466, "timestamp": "2025-10-01 04:11:54.923268", "step": 743, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:54.965364", "step": 743, "epoch": 1 }, { "type": "loss", "content": 0.03807631880044937, "timestamp": "2025-10-01 04:11:54.990275", "step": 744, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:55.026831", "step": 744, "epoch": 1 }, { "type": "loss", "content": 0.014975814148783684, "timestamp": "2025-10-01 04:11:55.028965", "step": 745, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:55.065030", "step": 745, "epoch": 1 }, { "type": "loss", "content": 0.015280152671039104, "timestamp": "2025-10-01 04:11:55.068313", "step": 746, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:55.101661", "step": 746, "epoch": 1 }, { "type": "loss", "content": 0.012907147407531738, "timestamp": "2025-10-01 04:11:55.103929", "step": 747, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:55.138170", "step": 747, "epoch": 1 }, { "type": "loss", "content": 0.029170582070946693, "timestamp": "2025-10-01 04:11:55.162265", "step": 748, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:55.194297", "step": 748, "epoch": 1 }, { "type": "loss", "content": 0.023870984092354774, "timestamp": "2025-10-01 04:11:55.197224", "step": 749, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:55.230218", "step": 749, "epoch": 1 }, { "type": "loss", "content": 0.019887370988726616, "timestamp": "2025-10-01 04:11:55.233192", "step": 750, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:55.266420", "step": 750, "epoch": 1 }, { "type": "loss", "content": 0.021655777469277382, "timestamp": "2025-10-01 04:11:55.269631", "step": 751, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:55.302756", "step": 751, "epoch": 1 }, { "type": "loss", "content": 0.03200121968984604, "timestamp": "2025-10-01 04:11:55.326805", "step": 752, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:55.362029", "step": 752, "epoch": 1 }, { "type": "loss", "content": 0.02003059722483158, "timestamp": "2025-10-01 04:11:55.367595", "step": 753, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:55.401802", "step": 753, "epoch": 1 }, { "type": "loss", "content": 0.0149669349193573, "timestamp": "2025-10-01 04:11:55.414367", "step": 754, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:55.458123", "step": 754, "epoch": 1 }, { "type": "loss", "content": 0.018943343311548233, "timestamp": "2025-10-01 04:11:55.463901", "step": 755, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:55.507977", "step": 755, "epoch": 1 }, { "type": "loss", "content": 0.02300729975104332, "timestamp": "2025-10-01 04:11:55.536691", "step": 756, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:55.583594", "step": 756, "epoch": 1 }, { "type": "loss", "content": 0.006794582586735487, "timestamp": "2025-10-01 04:11:55.592780", "step": 757, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:55.638867", "step": 757, "epoch": 1 }, { "type": "loss", "content": 0.007976575754582882, "timestamp": "2025-10-01 04:11:55.647286", "step": 758, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:55.700965", "step": 758, "epoch": 1 }, { "type": "loss", "content": 0.011165034957230091, "timestamp": "2025-10-01 04:11:55.709016", "step": 759, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:55.749078", "step": 759, "epoch": 1 }, { "type": "loss", "content": 0.014530644752085209, "timestamp": "2025-10-01 04:11:55.778360", "step": 760, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:55.814274", "step": 760, "epoch": 1 }, { "type": "loss", "content": 0.00375539087690413, "timestamp": "2025-10-01 04:11:55.817577", "step": 761, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:11:55.858699", "step": 761, "epoch": 1 }, { "type": "loss", "content": 0.006393683608621359, "timestamp": "2025-10-01 04:11:55.861649", "step": 762, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:55.897046", "step": 762, "epoch": 1 }, { "type": "loss", "content": 0.04293840005993843, "timestamp": "2025-10-01 04:11:55.899758", "step": 763, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:55.934308", "step": 763, "epoch": 1 }, { "type": "loss", "content": 0.01398535817861557, "timestamp": "2025-10-01 04:11:55.959440", "step": 764, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:55.994567", "step": 764, "epoch": 1 }, { "type": "loss", "content": 0.00975012220442295, "timestamp": "2025-10-01 04:11:55.999939", "step": 765, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:56.035453", "step": 765, "epoch": 1 }, { "type": "loss", "content": 0.020558195188641548, "timestamp": "2025-10-01 04:11:56.040047", "step": 766, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:56.076561", "step": 766, "epoch": 1 }, { "type": "loss", "content": 0.010339805856347084, "timestamp": "2025-10-01 04:11:56.079922", "step": 767, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:56.117425", "step": 767, "epoch": 1 }, { "type": "loss", "content": 0.05061133950948715, "timestamp": "2025-10-01 04:11:56.146045", "step": 768, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:56.190804", "step": 768, "epoch": 1 }, { "type": "loss", "content": 0.016301441937685013, "timestamp": "2025-10-01 04:11:56.194700", "step": 769, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:56.236612", "step": 769, "epoch": 1 }, { "type": "loss", "content": 0.030423134565353394, "timestamp": "2025-10-01 04:11:56.246395", "step": 770, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:56.299963", "step": 770, "epoch": 1 }, { "type": "loss", "content": 0.027092410251498222, "timestamp": "2025-10-01 04:11:56.309971", "step": 771, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:56.369486", "step": 771, "epoch": 1 }, { "type": "loss", "content": 0.004554868675768375, "timestamp": "2025-10-01 04:11:56.393571", "step": 772, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:56.445387", "step": 772, "epoch": 1 }, { "type": "loss", "content": 0.05927324295043945, "timestamp": "2025-10-01 04:11:56.454927", "step": 773, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:56.503918", "step": 773, "epoch": 1 }, { "type": "loss", "content": 0.003543505212292075, "timestamp": "2025-10-01 04:11:56.511457", "step": 774, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:56.556983", "step": 774, "epoch": 1 }, { "type": "loss", "content": 0.03816291317343712, "timestamp": "2025-10-01 04:11:56.562564", "step": 775, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:56.598733", "step": 775, "epoch": 1 }, { "type": "loss", "content": 0.03273823857307434, "timestamp": "2025-10-01 04:11:56.627274", "step": 776, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:56.671804", "step": 776, "epoch": 1 }, { "type": "loss", "content": 0.004695338662713766, "timestamp": "2025-10-01 04:11:56.675230", "step": 777, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:56.728301", "step": 777, "epoch": 1 }, { "type": "loss", "content": 0.002485628006979823, "timestamp": "2025-10-01 04:11:56.738272", "step": 778, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:56.783631", "step": 778, "epoch": 1 }, { "type": "loss", "content": 0.027432991191744804, "timestamp": "2025-10-01 04:11:56.786230", "step": 779, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:56.828549", "step": 779, "epoch": 1 }, { "type": "loss", "content": 0.007833785377442837, "timestamp": "2025-10-01 04:11:56.858782", "step": 780, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:56.906567", "step": 780, "epoch": 1 }, { "type": "loss", "content": 0.025769120082259178, "timestamp": "2025-10-01 04:11:56.916001", "step": 781, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:56.955938", "step": 781, "epoch": 1 }, { "type": "loss", "content": 0.02049829252064228, "timestamp": "2025-10-01 04:11:56.960001", "step": 782, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:56.997915", "step": 782, "epoch": 1 }, { "type": "loss", "content": 0.03482077643275261, "timestamp": "2025-10-01 04:11:57.003600", "step": 783, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:57.039411", "step": 783, "epoch": 1 }, { "type": "loss", "content": 0.014167026616632938, "timestamp": "2025-10-01 04:11:57.063203", "step": 784, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:57.108893", "step": 784, "epoch": 1 }, { "type": "loss", "content": 0.008381464518606663, "timestamp": "2025-10-01 04:11:57.116584", "step": 785, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:57.167527", "step": 785, "epoch": 1 }, { "type": "loss", "content": 0.005195831414312124, "timestamp": "2025-10-01 04:11:57.174777", "step": 786, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:57.232737", "step": 786, "epoch": 1 }, { "type": "loss", "content": 0.0064064571633934975, "timestamp": "2025-10-01 04:11:57.237699", "step": 787, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:57.291393", "step": 787, "epoch": 1 }, { "type": "loss", "content": 0.025258291512727737, "timestamp": "2025-10-01 04:11:57.322812", "step": 788, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:57.378250", "step": 788, "epoch": 1 }, { "type": "loss", "content": 0.011926391161978245, "timestamp": "2025-10-01 04:11:57.385280", "step": 789, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:57.431924", "step": 789, "epoch": 1 }, { "type": "loss", "content": 0.006661646068096161, "timestamp": "2025-10-01 04:11:57.434666", "step": 790, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:57.479799", "step": 790, "epoch": 1 }, { "type": "loss", "content": 0.01632728986442089, "timestamp": "2025-10-01 04:11:57.483261", "step": 791, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:57.532253", "step": 791, "epoch": 1 }, { "type": "loss", "content": 0.02356698177754879, "timestamp": "2025-10-01 04:11:57.559952", "step": 792, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:57.600552", "step": 792, "epoch": 1 }, { "type": "loss", "content": 0.015473469160497189, "timestamp": "2025-10-01 04:11:57.607668", "step": 793, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:57.650351", "step": 793, "epoch": 1 }, { "type": "loss", "content": 0.034613873809576035, "timestamp": "2025-10-01 04:11:57.656551", "step": 794, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:57.697746", "step": 794, "epoch": 1 }, { "type": "loss", "content": 0.014318762347102165, "timestamp": "2025-10-01 04:11:57.703561", "step": 795, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:57.744198", "step": 795, "epoch": 1 }, { "type": "loss", "content": 0.008876429870724678, "timestamp": "2025-10-01 04:11:57.771183", "step": 796, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:57.813699", "step": 796, "epoch": 1 }, { "type": "loss", "content": 0.012135490775108337, "timestamp": "2025-10-01 04:11:57.816535", "step": 797, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:57.857964", "step": 797, "epoch": 1 }, { "type": "loss", "content": 0.017030585557222366, "timestamp": "2025-10-01 04:11:57.863410", "step": 798, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:11:59.172396", "step": 798, "epoch": 1 }, { "type": "pplx", "content": 102363157.80185008, "timestamp": "2025-10-01 04:11:59.178535", "step": 798, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:59.218950", "step": 798, "epoch": 1 }, { "type": "loss", "content": 0.008845179341733456, "timestamp": "2025-10-01 04:11:59.225271", "step": 799, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:59.265737", "step": 799, "epoch": 1 }, { "type": "loss", "content": 0.036368466913700104, "timestamp": "2025-10-01 04:11:59.295196", "step": 800, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:59.337808", "step": 800, "epoch": 1 }, { "type": "loss", "content": 0.025295143947005272, "timestamp": "2025-10-01 04:11:59.343996", "step": 801, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:59.383178", "step": 801, "epoch": 1 }, { "type": "loss", "content": 0.027338379994034767, "timestamp": "2025-10-01 04:11:59.386776", "step": 802, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:59.421196", "step": 802, "epoch": 1 }, { "type": "loss", "content": 0.024521732702851295, "timestamp": "2025-10-01 04:11:59.424105", "step": 803, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:59.458420", "step": 803, "epoch": 1 }, { "type": "loss", "content": 0.012448400259017944, "timestamp": "2025-10-01 04:11:59.482844", "step": 804, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:59.516700", "step": 804, "epoch": 1 }, { "type": "loss", "content": 0.02401571348309517, "timestamp": "2025-10-01 04:11:59.520582", "step": 805, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:59.557363", "step": 805, "epoch": 1 }, { "type": "loss", "content": 0.023896140977740288, "timestamp": "2025-10-01 04:11:59.560563", "step": 806, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:59.598301", "step": 806, "epoch": 1 }, { "type": "loss", "content": 0.013595336116850376, "timestamp": "2025-10-01 04:11:59.603392", "step": 807, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:59.642625", "step": 807, "epoch": 1 }, { "type": "loss", "content": 0.019209247082471848, "timestamp": "2025-10-01 04:11:59.667810", "step": 808, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:59.703976", "step": 808, "epoch": 1 }, { "type": "loss", "content": 0.01122989971190691, "timestamp": "2025-10-01 04:11:59.707556", "step": 809, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:11:59.741656", "step": 809, "epoch": 1 }, { "type": "loss", "content": 0.015157909132540226, "timestamp": "2025-10-01 04:11:59.747150", "step": 810, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:59.786437", "step": 810, "epoch": 1 }, { "type": "loss", "content": 0.023954039439558983, "timestamp": "2025-10-01 04:11:59.792755", "step": 811, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:59.834677", "step": 811, "epoch": 1 }, { "type": "loss", "content": 0.01369868777692318, "timestamp": "2025-10-01 04:11:59.861648", "step": 812, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:11:59.902049", "step": 812, "epoch": 1 }, { "type": "loss", "content": 0.021969353780150414, "timestamp": "2025-10-01 04:11:59.905681", "step": 813, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:11:59.949346", "step": 813, "epoch": 1 }, { "type": "loss", "content": 0.015957282856106758, "timestamp": "2025-10-01 04:11:59.959602", "step": 814, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:00.017177", "step": 814, "epoch": 1 }, { "type": "loss", "content": 0.010398841463029385, "timestamp": "2025-10-01 04:12:00.025020", "step": 815, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:00.076324", "step": 815, "epoch": 1 }, { "type": "loss", "content": 0.017073335126042366, "timestamp": "2025-10-01 04:12:00.107522", "step": 816, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:00.155939", "step": 816, "epoch": 1 }, { "type": "loss", "content": 0.03205118328332901, "timestamp": "2025-10-01 04:12:00.166664", "step": 817, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:00.208485", "step": 817, "epoch": 1 }, { "type": "loss", "content": 0.018138429149985313, "timestamp": "2025-10-01 04:12:00.211567", "step": 818, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:00.257646", "step": 818, "epoch": 1 }, { "type": "loss", "content": 0.010016137734055519, "timestamp": "2025-10-01 04:12:00.261962", "step": 819, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:00.313960", "step": 819, "epoch": 1 }, { "type": "loss", "content": 0.010501930490136147, "timestamp": "2025-10-01 04:12:00.343409", "step": 820, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:00.396914", "step": 820, "epoch": 1 }, { "type": "loss", "content": 0.012027454562485218, "timestamp": "2025-10-01 04:12:00.409559", "step": 821, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:00.469584", "step": 821, "epoch": 1 }, { "type": "loss", "content": 0.017790386453270912, "timestamp": "2025-10-01 04:12:00.480896", "step": 822, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:00.538976", "step": 822, "epoch": 1 }, { "type": "loss", "content": 0.014075555838644505, "timestamp": "2025-10-01 04:12:00.551299", "step": 823, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:00.602096", "step": 823, "epoch": 1 }, { "type": "loss", "content": 0.02951137162744999, "timestamp": "2025-10-01 04:12:00.635264", "step": 824, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:00.691076", "step": 824, "epoch": 1 }, { "type": "loss", "content": 0.01087740994989872, "timestamp": "2025-10-01 04:12:00.697185", "step": 825, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:00.751038", "step": 825, "epoch": 1 }, { "type": "loss", "content": 0.015133983455598354, "timestamp": "2025-10-01 04:12:00.757248", "step": 826, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:00.801679", "step": 826, "epoch": 1 }, { "type": "loss", "content": 0.03115757927298546, "timestamp": "2025-10-01 04:12:00.803949", "step": 827, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:00.849409", "step": 827, "epoch": 1 }, { "type": "loss", "content": 0.012817888520658016, "timestamp": "2025-10-01 04:12:00.878164", "step": 828, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:00.933169", "step": 828, "epoch": 1 }, { "type": "loss", "content": 0.01865779235959053, "timestamp": "2025-10-01 04:12:00.942699", "step": 829, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:01.000681", "step": 829, "epoch": 1 }, { "type": "loss", "content": 0.005558286793529987, "timestamp": "2025-10-01 04:12:01.005026", "step": 830, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:01.065180", "step": 830, "epoch": 1 }, { "type": "loss", "content": 0.007820365950465202, "timestamp": "2025-10-01 04:12:01.078400", "step": 831, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:01.140996", "step": 831, "epoch": 1 }, { "type": "loss", "content": 0.006412907503545284, "timestamp": "2025-10-01 04:12:01.173058", "step": 832, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:01.218904", "step": 832, "epoch": 1 }, { "type": "loss", "content": 0.028616532683372498, "timestamp": "2025-10-01 04:12:01.225155", "step": 833, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:01.266444", "step": 833, "epoch": 1 }, { "type": "loss", "content": 0.013076567091047764, "timestamp": "2025-10-01 04:12:01.273313", "step": 834, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:01.316739", "step": 834, "epoch": 1 }, { "type": "loss", "content": 0.00448998948559165, "timestamp": "2025-10-01 04:12:01.318783", "step": 835, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:01.359621", "step": 835, "epoch": 1 }, { "type": "loss", "content": 0.007429972290992737, "timestamp": "2025-10-01 04:12:01.386693", "step": 836, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:01.431201", "step": 836, "epoch": 1 }, { "type": "loss", "content": 0.034416042268276215, "timestamp": "2025-10-01 04:12:01.439471", "step": 837, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:01.487945", "step": 837, "epoch": 1 }, { "type": "loss", "content": 0.015252464450895786, "timestamp": "2025-10-01 04:12:01.497006", "step": 838, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:01.553566", "step": 838, "epoch": 1 }, { "type": "loss", "content": 0.030947471037507057, "timestamp": "2025-10-01 04:12:01.561050", "step": 839, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:01.607202", "step": 839, "epoch": 1 }, { "type": "loss", "content": 0.01654486544430256, "timestamp": "2025-10-01 04:12:01.632622", "step": 840, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:01.682363", "step": 840, "epoch": 1 }, { "type": "loss", "content": 0.01720511168241501, "timestamp": "2025-10-01 04:12:01.688159", "step": 841, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:01.728450", "step": 841, "epoch": 1 }, { "type": "loss", "content": 0.031022807583212852, "timestamp": "2025-10-01 04:12:01.732792", "step": 842, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:01.769216", "step": 842, "epoch": 1 }, { "type": "loss", "content": 0.01400839351117611, "timestamp": "2025-10-01 04:12:01.775955", "step": 843, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:01.824158", "step": 843, "epoch": 1 }, { "type": "loss", "content": 0.017199808731675148, "timestamp": "2025-10-01 04:12:01.848412", "step": 844, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:01.906886", "step": 844, "epoch": 1 }, { "type": "loss", "content": 0.0067504760809242725, "timestamp": "2025-10-01 04:12:01.915826", "step": 845, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:01.970429", "step": 845, "epoch": 1 }, { "type": "loss", "content": 0.0025089120026677847, "timestamp": "2025-10-01 04:12:01.981948", "step": 846, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:02.040140", "step": 846, "epoch": 1 }, { "type": "loss", "content": 0.030881991609930992, "timestamp": "2025-10-01 04:12:02.051862", "step": 847, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:02.112878", "step": 847, "epoch": 1 }, { "type": "loss", "content": 0.01905219629406929, "timestamp": "2025-10-01 04:12:02.138776", "step": 848, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:02.194945", "step": 848, "epoch": 1 }, { "type": "loss", "content": 0.005448561627417803, "timestamp": "2025-10-01 04:12:02.202879", "step": 849, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:02.250476", "step": 849, "epoch": 1 }, { "type": "loss", "content": 0.003623712807893753, "timestamp": "2025-10-01 04:12:02.254248", "step": 850, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:02.303264", "step": 850, "epoch": 1 }, { "type": "loss", "content": 0.0068411752581596375, "timestamp": "2025-10-01 04:12:02.308834", "step": 851, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:02.350799", "step": 851, "epoch": 1 }, { "type": "loss", "content": 0.020454945042729378, "timestamp": "2025-10-01 04:12:02.383078", "step": 852, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:02.422018", "step": 852, "epoch": 1 }, { "type": "loss", "content": 0.006170716602355242, "timestamp": "2025-10-01 04:12:02.428049", "step": 853, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:02.481776", "step": 853, "epoch": 1 }, { "type": "loss", "content": 0.022493934258818626, "timestamp": "2025-10-01 04:12:02.486925", "step": 854, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:02.522094", "step": 854, "epoch": 1 }, { "type": "loss", "content": 0.0045220511965453625, "timestamp": "2025-10-01 04:12:02.526650", "step": 855, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:12:03.994172", "step": 855, "epoch": 1 }, { "type": "pplx", "content": 107042227.57520783, "timestamp": "2025-10-01 04:12:04.004953", "step": 855, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:04.056939", "step": 855, "epoch": 1 }, { "type": "loss", "content": 0.0042382897809147835, "timestamp": "2025-10-01 04:12:04.089067", "step": 856, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:04.165021", "step": 856, "epoch": 1 }, { "type": "loss", "content": 0.014735554344952106, "timestamp": "2025-10-01 04:12:04.175888", "step": 857, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:04.226904", "step": 857, "epoch": 1 }, { "type": "loss", "content": 0.011172778904438019, "timestamp": "2025-10-01 04:12:04.236757", "step": 858, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:04.291464", "step": 858, "epoch": 1 }, { "type": "loss", "content": 0.013173661194741726, "timestamp": "2025-10-01 04:12:04.299580", "step": 859, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:04.351204", "step": 859, "epoch": 1 }, { "type": "loss", "content": 0.027727048844099045, "timestamp": "2025-10-01 04:12:04.382371", "step": 860, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:04.434095", "step": 860, "epoch": 1 }, { "type": "loss", "content": 0.017057133838534355, "timestamp": "2025-10-01 04:12:04.443826", "step": 861, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:04.495787", "step": 861, "epoch": 1 }, { "type": "loss", "content": 0.007997279986739159, "timestamp": "2025-10-01 04:12:04.506394", "step": 862, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:04.561733", "step": 862, "epoch": 1 }, { "type": "loss", "content": 0.01755506731569767, "timestamp": "2025-10-01 04:12:04.569684", "step": 863, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:04.625051", "step": 863, "epoch": 1 }, { "type": "loss", "content": 0.013182953000068665, "timestamp": "2025-10-01 04:12:04.649663", "step": 864, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:04.698884", "step": 864, "epoch": 1 }, { "type": "loss", "content": 0.03394387289881706, "timestamp": "2025-10-01 04:12:04.708783", "step": 865, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:04.756031", "step": 865, "epoch": 1 }, { "type": "loss", "content": 0.009329610504209995, "timestamp": "2025-10-01 04:12:04.761469", "step": 866, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:04.802913", "step": 866, "epoch": 1 }, { "type": "loss", "content": 0.004415162838995457, "timestamp": "2025-10-01 04:12:04.809347", "step": 867, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:04.847986", "step": 867, "epoch": 1 }, { "type": "loss", "content": 0.014574938453733921, "timestamp": "2025-10-01 04:12:04.876622", "step": 868, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:04.923842", "step": 868, "epoch": 1 }, { "type": "loss", "content": 0.01505463756620884, "timestamp": "2025-10-01 04:12:04.926431", "step": 869, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:04.971915", "step": 869, "epoch": 1 }, { "type": "loss", "content": 0.03582816570997238, "timestamp": "2025-10-01 04:12:04.981438", "step": 870, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:05.022620", "step": 870, "epoch": 1 }, { "type": "loss", "content": 0.002920368453487754, "timestamp": "2025-10-01 04:12:05.025098", "step": 871, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:05.070777", "step": 871, "epoch": 1 }, { "type": "loss", "content": 0.002681077690795064, "timestamp": "2025-10-01 04:12:05.098583", "step": 872, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:05.145298", "step": 872, "epoch": 1 }, { "type": "loss", "content": 0.010480071417987347, "timestamp": "2025-10-01 04:12:05.153231", "step": 873, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:05.203275", "step": 873, "epoch": 1 }, { "type": "loss", "content": 0.00535059766843915, "timestamp": "2025-10-01 04:12:05.210753", "step": 874, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:05.251607", "step": 874, "epoch": 1 }, { "type": "loss", "content": 0.02054414339363575, "timestamp": "2025-10-01 04:12:05.256635", "step": 875, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:05.293195", "step": 875, "epoch": 1 }, { "type": "loss", "content": 0.029474778100848198, "timestamp": "2025-10-01 04:12:05.318441", "step": 876, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:05.358847", "step": 876, "epoch": 1 }, { "type": "loss", "content": 0.006558016873896122, "timestamp": "2025-10-01 04:12:05.366568", "step": 877, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:05.400568", "step": 877, "epoch": 1 }, { "type": "loss", "content": 0.0020208796486258507, "timestamp": "2025-10-01 04:12:05.409495", "step": 878, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:05.461083", "step": 878, "epoch": 1 }, { "type": "loss", "content": 0.030072884634137154, "timestamp": "2025-10-01 04:12:05.466107", "step": 879, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:05.522738", "step": 879, "epoch": 1 }, { "type": "loss", "content": 0.04798530787229538, "timestamp": "2025-10-01 04:12:05.547105", "step": 880, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:05.595769", "step": 880, "epoch": 1 }, { "type": "loss", "content": 0.02707165852189064, "timestamp": "2025-10-01 04:12:05.598633", "step": 881, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:05.643281", "step": 881, "epoch": 1 }, { "type": "loss", "content": 0.04058273881673813, "timestamp": "2025-10-01 04:12:05.650065", "step": 882, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:05.706593", "step": 882, "epoch": 1 }, { "type": "loss", "content": 0.03420673683285713, "timestamp": "2025-10-01 04:12:05.717500", "step": 883, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:05.767365", "step": 883, "epoch": 1 }, { "type": "loss", "content": 0.0023017581552267075, "timestamp": "2025-10-01 04:12:05.792872", "step": 884, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:05.837273", "step": 884, "epoch": 1 }, { "type": "loss", "content": 0.00631122337654233, "timestamp": "2025-10-01 04:12:05.848497", "step": 885, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:05.892020", "step": 885, "epoch": 1 }, { "type": "loss", "content": 0.02051430568099022, "timestamp": "2025-10-01 04:12:05.902489", "step": 886, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:05.950386", "step": 886, "epoch": 1 }, { "type": "loss", "content": 0.008391078561544418, "timestamp": "2025-10-01 04:12:05.962579", "step": 887, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:05.999759", "step": 887, "epoch": 1 }, { "type": "loss", "content": 0.02948450855910778, "timestamp": "2025-10-01 04:12:06.033511", "step": 888, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:06.092382", "step": 888, "epoch": 1 }, { "type": "loss", "content": 0.017128828912973404, "timestamp": "2025-10-01 04:12:06.103309", "step": 889, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:06.169790", "step": 889, "epoch": 1 }, { "type": "loss", "content": 0.030220722779631615, "timestamp": "2025-10-01 04:12:06.176744", "step": 890, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:06.224499", "step": 890, "epoch": 1 }, { "type": "loss", "content": 0.022421736270189285, "timestamp": "2025-10-01 04:12:06.235003", "step": 891, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:06.288345", "step": 891, "epoch": 1 }, { "type": "loss", "content": 0.015720544382929802, "timestamp": "2025-10-01 04:12:06.313720", "step": 892, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:06.361819", "step": 892, "epoch": 1 }, { "type": "loss", "content": 0.01451858039945364, "timestamp": "2025-10-01 04:12:06.367119", "step": 893, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:06.405891", "step": 893, "epoch": 1 }, { "type": "loss", "content": 0.04697972536087036, "timestamp": "2025-10-01 04:12:06.411375", "step": 894, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:06.454651", "step": 894, "epoch": 1 }, { "type": "loss", "content": 0.021802568808197975, "timestamp": "2025-10-01 04:12:06.459383", "step": 895, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:06.502047", "step": 895, "epoch": 1 }, { "type": "loss", "content": 0.021685032173991203, "timestamp": "2025-10-01 04:12:06.528635", "step": 896, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:06.571771", "step": 896, "epoch": 1 }, { "type": "loss", "content": 0.0264862310141325, "timestamp": "2025-10-01 04:12:06.575865", "step": 897, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:06.613185", "step": 897, "epoch": 1 }, { "type": "loss", "content": 0.014706993475556374, "timestamp": "2025-10-01 04:12:06.618410", "step": 898, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:06.656039", "step": 898, "epoch": 1 }, { "type": "loss", "content": 0.010481654666364193, "timestamp": "2025-10-01 04:12:06.660860", "step": 899, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:06.702758", "step": 899, "epoch": 1 }, { "type": "loss", "content": 0.013229523785412312, "timestamp": "2025-10-01 04:12:06.729066", "step": 900, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:06.768662", "step": 900, "epoch": 1 }, { "type": "loss", "content": 0.026366079226136208, "timestamp": "2025-10-01 04:12:06.771913", "step": 901, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:06.809447", "step": 901, "epoch": 1 }, { "type": "loss", "content": 0.021769892424345016, "timestamp": "2025-10-01 04:12:06.813207", "step": 902, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:06.857845", "step": 902, "epoch": 1 }, { "type": "loss", "content": 0.007020240183919668, "timestamp": "2025-10-01 04:12:06.860778", "step": 903, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:06.903744", "step": 903, "epoch": 1 }, { "type": "loss", "content": 0.035886842757463455, "timestamp": "2025-10-01 04:12:06.933546", "step": 904, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:06.981239", "step": 904, "epoch": 1 }, { "type": "loss", "content": 0.012589776888489723, "timestamp": "2025-10-01 04:12:06.984791", "step": 905, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:07.023171", "step": 905, "epoch": 1 }, { "type": "loss", "content": 0.009022383950650692, "timestamp": "2025-10-01 04:12:07.026527", "step": 906, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:07.067039", "step": 906, "epoch": 1 }, { "type": "loss", "content": 0.03269534558057785, "timestamp": "2025-10-01 04:12:07.069709", "step": 907, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:07.106064", "step": 907, "epoch": 1 }, { "type": "loss", "content": 0.020773855969309807, "timestamp": "2025-10-01 04:12:07.132141", "step": 908, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:07.167996", "step": 908, "epoch": 1 }, { "type": "loss", "content": 0.015614612959325314, "timestamp": "2025-10-01 04:12:07.172464", "step": 909, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:07.211257", "step": 909, "epoch": 1 }, { "type": "loss", "content": 0.0035907472483813763, "timestamp": "2025-10-01 04:12:07.213758", "step": 910, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:07.252624", "step": 910, "epoch": 1 }, { "type": "loss", "content": 0.016620244830846786, "timestamp": "2025-10-01 04:12:07.260429", "step": 911, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:07.306415", "step": 911, "epoch": 1 }, { "type": "loss", "content": 0.005902507808059454, "timestamp": "2025-10-01 04:12:07.339644", "step": 912, "epoch": 1 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:12:08.515415", "step": 912, "epoch": 1 }, { "type": "pplx", "content": 96790900.33100773, "timestamp": "2025-10-01 04:12:08.518295", "step": 912, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:08.549571", "step": 912, "epoch": 1 }, { "type": "loss", "content": 0.044129107147455215, "timestamp": "2025-10-01 04:12:08.555513", "step": 913, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:08.602448", "step": 913, "epoch": 1 }, { "type": "loss", "content": 0.010207407176494598, "timestamp": "2025-10-01 04:12:08.609138", "step": 914, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:08.651030", "step": 914, "epoch": 1 }, { "type": "loss", "content": 0.006177072878926992, "timestamp": "2025-10-01 04:12:08.654898", "step": 915, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:08.690472", "step": 915, "epoch": 1 }, { "type": "loss", "content": 0.0061223325319588184, "timestamp": "2025-10-01 04:12:08.715821", "step": 916, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:08.761149", "step": 916, "epoch": 1 }, { "type": "loss", "content": 0.03369773551821709, "timestamp": "2025-10-01 04:12:08.769894", "step": 917, "epoch": 1 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:08.831986", "step": 917, "epoch": 2 }, { "type": "loss", "content": 0.04762774705886841, "timestamp": "2025-10-01 04:12:08.837318", "step": 918, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:08.880719", "step": 918, "epoch": 2 }, { "type": "loss", "content": 0.018315879628062248, "timestamp": "2025-10-01 04:12:08.891227", "step": 919, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:08.960991", "step": 919, "epoch": 2 }, { "type": "loss", "content": 0.0561673678457737, "timestamp": "2025-10-01 04:12:08.993133", "step": 920, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:09.059612", "step": 920, "epoch": 2 }, { "type": "loss", "content": 0.015595680102705956, "timestamp": "2025-10-01 04:12:09.074119", "step": 921, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:09.136361", "step": 921, "epoch": 2 }, { "type": "loss", "content": 0.0501207634806633, "timestamp": "2025-10-01 04:12:09.149929", "step": 922, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:09.217652", "step": 922, "epoch": 2 }, { "type": "loss", "content": 0.02210090495646, "timestamp": "2025-10-01 04:12:09.221052", "step": 923, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:09.287013", "step": 923, "epoch": 2 }, { "type": "loss", "content": 0.04293939843773842, "timestamp": "2025-10-01 04:12:09.318752", "step": 924, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:09.365635", "step": 924, "epoch": 2 }, { "type": "loss", "content": 0.0104265958070755, "timestamp": "2025-10-01 04:12:09.368892", "step": 925, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:09.408790", "step": 925, "epoch": 2 }, { "type": "loss", "content": 0.00624179607257247, "timestamp": "2025-10-01 04:12:09.415409", "step": 926, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:09.465406", "step": 926, "epoch": 2 }, { "type": "loss", "content": 0.02589820884168148, "timestamp": "2025-10-01 04:12:09.472234", "step": 927, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:09.513580", "step": 927, "epoch": 2 }, { "type": "loss", "content": 0.020178841426968575, "timestamp": "2025-10-01 04:12:09.540253", "step": 928, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:09.579214", "step": 928, "epoch": 2 }, { "type": "loss", "content": 0.01020850706845522, "timestamp": "2025-10-01 04:12:09.585758", "step": 929, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:09.624660", "step": 929, "epoch": 2 }, { "type": "loss", "content": 0.015341407619416714, "timestamp": "2025-10-01 04:12:09.629545", "step": 930, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:09.667253", "step": 930, "epoch": 2 }, { "type": "loss", "content": 0.021973326802253723, "timestamp": "2025-10-01 04:12:09.671565", "step": 931, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:09.712506", "step": 931, "epoch": 2 }, { "type": "loss", "content": 0.025720873847603798, "timestamp": "2025-10-01 04:12:09.740189", "step": 932, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:09.778019", "step": 932, "epoch": 2 }, { "type": "loss", "content": 0.018589098006486893, "timestamp": "2025-10-01 04:12:09.785840", "step": 933, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:09.825380", "step": 933, "epoch": 2 }, { "type": "loss", "content": 0.025446845218539238, "timestamp": "2025-10-01 04:12:09.830555", "step": 934, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:09.878671", "step": 934, "epoch": 2 }, { "type": "loss", "content": 0.014553597196936607, "timestamp": "2025-10-01 04:12:09.882659", "step": 935, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:09.930777", "step": 935, "epoch": 2 }, { "type": "loss", "content": 0.021716592833399773, "timestamp": "2025-10-01 04:12:09.955352", "step": 936, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:09.999942", "step": 936, "epoch": 2 }, { "type": "loss", "content": 0.01763700507581234, "timestamp": "2025-10-01 04:12:10.005921", "step": 937, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:10.063041", "step": 937, "epoch": 2 }, { "type": "loss", "content": 0.020768333226442337, "timestamp": "2025-10-01 04:12:10.069442", "step": 938, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:10.110045", "step": 938, "epoch": 2 }, { "type": "loss", "content": 0.02879549190402031, "timestamp": "2025-10-01 04:12:10.115375", "step": 939, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:10.163679", "step": 939, "epoch": 2 }, { "type": "loss", "content": 0.018444154411554337, "timestamp": "2025-10-01 04:12:10.191711", "step": 940, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:10.229687", "step": 940, "epoch": 2 }, { "type": "loss", "content": 0.024395182728767395, "timestamp": "2025-10-01 04:12:10.233015", "step": 941, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:10.269980", "step": 941, "epoch": 2 }, { "type": "loss", "content": 0.01769259199500084, "timestamp": "2025-10-01 04:12:10.276898", "step": 942, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:10.321030", "step": 942, "epoch": 2 }, { "type": "loss", "content": 0.01453376840800047, "timestamp": "2025-10-01 04:12:10.325942", "step": 943, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:10.367664", "step": 943, "epoch": 2 }, { "type": "loss", "content": 0.015906628221273422, "timestamp": "2025-10-01 04:12:10.396363", "step": 944, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:10.442167", "step": 944, "epoch": 2 }, { "type": "loss", "content": 0.02611907199025154, "timestamp": "2025-10-01 04:12:10.447021", "step": 945, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:10.487681", "step": 945, "epoch": 2 }, { "type": "loss", "content": 0.019764339551329613, "timestamp": "2025-10-01 04:12:10.497805", "step": 946, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:10.545114", "step": 946, "epoch": 2 }, { "type": "loss", "content": 0.019637709483504295, "timestamp": "2025-10-01 04:12:10.554892", "step": 947, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:10.607893", "step": 947, "epoch": 2 }, { "type": "loss", "content": 0.02475491352379322, "timestamp": "2025-10-01 04:12:10.634591", "step": 948, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:10.678263", "step": 948, "epoch": 2 }, { "type": "loss", "content": 0.021814698353409767, "timestamp": "2025-10-01 04:12:10.684638", "step": 949, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:10.723482", "step": 949, "epoch": 2 }, { "type": "loss", "content": 0.013068260625004768, "timestamp": "2025-10-01 04:12:10.727501", "step": 950, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:10.769434", "step": 950, "epoch": 2 }, { "type": "loss", "content": 0.028850361704826355, "timestamp": "2025-10-01 04:12:10.775430", "step": 951, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:10.813547", "step": 951, "epoch": 2 }, { "type": "loss", "content": 0.012638353742659092, "timestamp": "2025-10-01 04:12:10.838855", "step": 952, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:10.873276", "step": 952, "epoch": 2 }, { "type": "loss", "content": 0.018763437867164612, "timestamp": "2025-10-01 04:12:10.878425", "step": 953, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:10.920391", "step": 953, "epoch": 2 }, { "type": "loss", "content": 0.01879069395363331, "timestamp": "2025-10-01 04:12:10.925301", "step": 954, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:10.963130", "step": 954, "epoch": 2 }, { "type": "loss", "content": 0.015433231368660927, "timestamp": "2025-10-01 04:12:10.966266", "step": 955, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:11.001795", "step": 955, "epoch": 2 }, { "type": "loss", "content": 0.010517173446714878, "timestamp": "2025-10-01 04:12:11.026780", "step": 956, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:11.061905", "step": 956, "epoch": 2 }, { "type": "loss", "content": 0.01133835967630148, "timestamp": "2025-10-01 04:12:11.066384", "step": 957, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:11.101600", "step": 957, "epoch": 2 }, { "type": "loss", "content": 0.006800402887165546, "timestamp": "2025-10-01 04:12:11.108591", "step": 958, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:11.147042", "step": 958, "epoch": 2 }, { "type": "loss", "content": 0.01833231933414936, "timestamp": "2025-10-01 04:12:11.150072", "step": 959, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:11.183028", "step": 959, "epoch": 2 }, { "type": "loss", "content": 0.019620051607489586, "timestamp": "2025-10-01 04:12:11.207303", "step": 960, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:11.241080", "step": 960, "epoch": 2 }, { "type": "loss", "content": 0.0022921969648450613, "timestamp": "2025-10-01 04:12:11.243564", "step": 961, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:11.275719", "step": 961, "epoch": 2 }, { "type": "loss", "content": 0.0034712564665824175, "timestamp": "2025-10-01 04:12:11.280361", "step": 962, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:11.314318", "step": 962, "epoch": 2 }, { "type": "loss", "content": 0.003425298258662224, "timestamp": "2025-10-01 04:12:11.319283", "step": 963, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:11.359994", "step": 963, "epoch": 2 }, { "type": "loss", "content": 0.021860791370272636, "timestamp": "2025-10-01 04:12:11.386160", "step": 964, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:11.418662", "step": 964, "epoch": 2 }, { "type": "loss", "content": 0.009410454891622066, "timestamp": "2025-10-01 04:12:11.421491", "step": 965, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:11.455087", "step": 965, "epoch": 2 }, { "type": "loss", "content": 0.0017230076482519507, "timestamp": "2025-10-01 04:12:11.465445", "step": 966, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:11.508021", "step": 966, "epoch": 2 }, { "type": "loss", "content": 0.030510956421494484, "timestamp": "2025-10-01 04:12:11.516776", "step": 967, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:11.579721", "step": 967, "epoch": 2 }, { "type": "loss", "content": 0.0029920325614511967, "timestamp": "2025-10-01 04:12:11.611391", "step": 968, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:11.666162", "step": 968, "epoch": 2 }, { "type": "loss", "content": 0.04396705701947212, "timestamp": "2025-10-01 04:12:11.676197", "step": 969, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:12:13.097572", "step": 969, "epoch": 2 }, { "type": "pplx", "content": 81072435.35475479, "timestamp": "2025-10-01 04:12:13.100930", "step": 969, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:13.134354", "step": 969, "epoch": 2 }, { "type": "loss", "content": 0.03889823704957962, "timestamp": "2025-10-01 04:12:13.136423", "step": 970, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:13.169704", "step": 970, "epoch": 2 }, { "type": "loss", "content": 0.03967985138297081, "timestamp": "2025-10-01 04:12:13.173045", "step": 971, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:13.206990", "step": 971, "epoch": 2 }, { "type": "loss", "content": 0.019627872854471207, "timestamp": "2025-10-01 04:12:13.230581", "step": 972, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:13.267976", "step": 972, "epoch": 2 }, { "type": "loss", "content": 0.03085772506892681, "timestamp": "2025-10-01 04:12:13.271186", "step": 973, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:13.304172", "step": 973, "epoch": 2 }, { "type": "loss", "content": 0.0168258436024189, "timestamp": "2025-10-01 04:12:13.310505", "step": 974, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:13.348723", "step": 974, "epoch": 2 }, { "type": "loss", "content": 0.018279071897268295, "timestamp": "2025-10-01 04:12:13.354676", "step": 975, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:13.396816", "step": 975, "epoch": 2 }, { "type": "loss", "content": 0.024387961253523827, "timestamp": "2025-10-01 04:12:13.425656", "step": 976, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:13.464578", "step": 976, "epoch": 2 }, { "type": "loss", "content": 0.029510188847780228, "timestamp": "2025-10-01 04:12:13.469101", "step": 977, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:13.504809", "step": 977, "epoch": 2 }, { "type": "loss", "content": 0.011254996992647648, "timestamp": "2025-10-01 04:12:13.512358", "step": 978, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:13.556342", "step": 978, "epoch": 2 }, { "type": "loss", "content": 0.02034107968211174, "timestamp": "2025-10-01 04:12:13.562588", "step": 979, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:13.607539", "step": 979, "epoch": 2 }, { "type": "loss", "content": 0.0030284065287560225, "timestamp": "2025-10-01 04:12:13.636953", "step": 980, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:13.676325", "step": 980, "epoch": 2 }, { "type": "loss", "content": 0.004054553806781769, "timestamp": "2025-10-01 04:12:13.681705", "step": 981, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:13.718878", "step": 981, "epoch": 2 }, { "type": "loss", "content": 0.0036088896449655294, "timestamp": "2025-10-01 04:12:13.723673", "step": 982, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:13.763124", "step": 982, "epoch": 2 }, { "type": "loss", "content": 0.03551328182220459, "timestamp": "2025-10-01 04:12:13.771368", "step": 983, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:13.815265", "step": 983, "epoch": 2 }, { "type": "loss", "content": 0.004522853996604681, "timestamp": "2025-10-01 04:12:13.841402", "step": 984, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:13.887015", "step": 984, "epoch": 2 }, { "type": "loss", "content": 0.022567346692085266, "timestamp": "2025-10-01 04:12:13.895150", "step": 985, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:13.936009", "step": 985, "epoch": 2 }, { "type": "loss", "content": 0.026564184576272964, "timestamp": "2025-10-01 04:12:13.939063", "step": 986, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:13.981348", "step": 986, "epoch": 2 }, { "type": "loss", "content": 0.02584756910800934, "timestamp": "2025-10-01 04:12:13.987136", "step": 987, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:14.033434", "step": 987, "epoch": 2 }, { "type": "loss", "content": 0.003964011557400227, "timestamp": "2025-10-01 04:12:14.057717", "step": 988, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:14.104433", "step": 988, "epoch": 2 }, { "type": "loss", "content": 0.03352031856775284, "timestamp": "2025-10-01 04:12:14.110344", "step": 989, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:14.153205", "step": 989, "epoch": 2 }, { "type": "loss", "content": 0.0071314736269414425, "timestamp": "2025-10-01 04:12:14.156565", "step": 990, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:14.189427", "step": 990, "epoch": 2 }, { "type": "loss", "content": 0.03919211030006409, "timestamp": "2025-10-01 04:12:14.193085", "step": 991, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:14.226823", "step": 991, "epoch": 2 }, { "type": "loss", "content": 0.018441015854477882, "timestamp": "2025-10-01 04:12:14.252171", "step": 992, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:14.286405", "step": 992, "epoch": 2 }, { "type": "loss", "content": 0.02514655515551567, "timestamp": "2025-10-01 04:12:14.290860", "step": 993, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:14.323829", "step": 993, "epoch": 2 }, { "type": "loss", "content": 0.022655008360743523, "timestamp": "2025-10-01 04:12:14.326870", "step": 994, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:14.359838", "step": 994, "epoch": 2 }, { "type": "loss", "content": 0.018373850733041763, "timestamp": "2025-10-01 04:12:14.362778", "step": 995, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:14.396282", "step": 995, "epoch": 2 }, { "type": "loss", "content": 0.02451673150062561, "timestamp": "2025-10-01 04:12:14.421074", "step": 996, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:14.464882", "step": 996, "epoch": 2 }, { "type": "loss", "content": 0.018129874020814896, "timestamp": "2025-10-01 04:12:14.472481", "step": 997, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:14.517635", "step": 997, "epoch": 2 }, { "type": "loss", "content": 0.026043349876999855, "timestamp": "2025-10-01 04:12:14.527495", "step": 998, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:14.571039", "step": 998, "epoch": 2 }, { "type": "loss", "content": 0.025865105912089348, "timestamp": "2025-10-01 04:12:14.578343", "step": 999, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:14.625556", "step": 999, "epoch": 2 }, { "type": "loss", "content": 0.006189211271703243, "timestamp": "2025-10-01 04:12:14.650728", "step": 1000, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1000", "timestamp": "2025-10-01 04:12:20.080288", "step": 1000, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:20.130469", "step": 1000, "epoch": 2 }, { "type": "loss", "content": 0.025496182963252068, "timestamp": "2025-10-01 04:12:20.139070", "step": 1001, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:20.186396", "step": 1001, "epoch": 2 }, { "type": "loss", "content": 0.019999513402581215, "timestamp": "2025-10-01 04:12:20.192583", "step": 1002, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:20.235175", "step": 1002, "epoch": 2 }, { "type": "loss", "content": 0.02922123670578003, "timestamp": "2025-10-01 04:12:20.242913", "step": 1003, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:20.290618", "step": 1003, "epoch": 2 }, { "type": "loss", "content": 0.011163772083818913, "timestamp": "2025-10-01 04:12:20.320274", "step": 1004, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:20.366077", "step": 1004, "epoch": 2 }, { "type": "loss", "content": 0.019625648856163025, "timestamp": "2025-10-01 04:12:20.373317", "step": 1005, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:20.426353", "step": 1005, "epoch": 2 }, { "type": "loss", "content": 0.026116380468010902, "timestamp": "2025-10-01 04:12:20.438592", "step": 1006, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:20.490040", "step": 1006, "epoch": 2 }, { "type": "loss", "content": 0.022429784759879112, "timestamp": "2025-10-01 04:12:20.503098", "step": 1007, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:20.550783", "step": 1007, "epoch": 2 }, { "type": "loss", "content": 0.03787512332201004, "timestamp": "2025-10-01 04:12:20.575550", "step": 1008, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:20.619300", "step": 1008, "epoch": 2 }, { "type": "loss", "content": 0.02521323598921299, "timestamp": "2025-10-01 04:12:20.625053", "step": 1009, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:20.668753", "step": 1009, "epoch": 2 }, { "type": "loss", "content": 0.01483877096325159, "timestamp": "2025-10-01 04:12:20.677295", "step": 1010, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:20.718640", "step": 1010, "epoch": 2 }, { "type": "loss", "content": 0.010802079923450947, "timestamp": "2025-10-01 04:12:20.730255", "step": 1011, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:20.781829", "step": 1011, "epoch": 2 }, { "type": "loss", "content": 0.02756008878350258, "timestamp": "2025-10-01 04:12:20.811183", "step": 1012, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:20.859332", "step": 1012, "epoch": 2 }, { "type": "loss", "content": 0.04122751206159592, "timestamp": "2025-10-01 04:12:20.872435", "step": 1013, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:20.925042", "step": 1013, "epoch": 2 }, { "type": "loss", "content": 0.021239006891846657, "timestamp": "2025-10-01 04:12:20.937739", "step": 1014, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:20.989925", "step": 1014, "epoch": 2 }, { "type": "loss", "content": 0.01687648706138134, "timestamp": "2025-10-01 04:12:21.001251", "step": 1015, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:21.062187", "step": 1015, "epoch": 2 }, { "type": "loss", "content": 0.00788965355604887, "timestamp": "2025-10-01 04:12:21.094134", "step": 1016, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:21.148712", "step": 1016, "epoch": 2 }, { "type": "loss", "content": 0.02206888608634472, "timestamp": "2025-10-01 04:12:21.152178", "step": 1017, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:21.201815", "step": 1017, "epoch": 2 }, { "type": "loss", "content": 0.021229689940810204, "timestamp": "2025-10-01 04:12:21.214308", "step": 1018, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:21.265543", "step": 1018, "epoch": 2 }, { "type": "loss", "content": 0.011558206751942635, "timestamp": "2025-10-01 04:12:21.274292", "step": 1019, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:21.323819", "step": 1019, "epoch": 2 }, { "type": "loss", "content": 0.019588660448789597, "timestamp": "2025-10-01 04:12:21.360075", "step": 1020, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:21.410793", "step": 1020, "epoch": 2 }, { "type": "loss", "content": 0.028030814602971077, "timestamp": "2025-10-01 04:12:21.424067", "step": 1021, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:21.498520", "step": 1021, "epoch": 2 }, { "type": "loss", "content": 0.03801051899790764, "timestamp": "2025-10-01 04:12:21.507802", "step": 1022, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:21.560572", "step": 1022, "epoch": 2 }, { "type": "loss", "content": 0.002824042458087206, "timestamp": "2025-10-01 04:12:21.568695", "step": 1023, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:21.621389", "step": 1023, "epoch": 2 }, { "type": "loss", "content": 0.011485311202704906, "timestamp": "2025-10-01 04:12:21.653646", "step": 1024, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:21.702889", "step": 1024, "epoch": 2 }, { "type": "loss", "content": 0.04113561287522316, "timestamp": "2025-10-01 04:12:21.710504", "step": 1025, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:21.761086", "step": 1025, "epoch": 2 }, { "type": "loss", "content": 0.05204043537378311, "timestamp": "2025-10-01 04:12:21.770999", "step": 1026, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:12:23.230234", "step": 1026, "epoch": 2 }, { "type": "pplx", "content": 74534656.95469436, "timestamp": "2025-10-01 04:12:23.241058", "step": 1026, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:12:23.282490", "step": 1026, "epoch": 2 }, { "type": "loss", "content": 0.060518063604831696, "timestamp": "2025-10-01 04:12:23.286656", "step": 1027, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:23.319127", "step": 1027, "epoch": 2 }, { "type": "loss", "content": 0.003438900923356414, "timestamp": "2025-10-01 04:12:23.357621", "step": 1028, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:23.402497", "step": 1028, "epoch": 2 }, { "type": "loss", "content": 0.016376933082938194, "timestamp": "2025-10-01 04:12:23.414993", "step": 1029, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:23.463369", "step": 1029, "epoch": 2 }, { "type": "loss", "content": 0.01734158955514431, "timestamp": "2025-10-01 04:12:23.477727", "step": 1030, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:23.535829", "step": 1030, "epoch": 2 }, { "type": "loss", "content": 0.013855542056262493, "timestamp": "2025-10-01 04:12:23.547595", "step": 1031, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:23.607960", "step": 1031, "epoch": 2 }, { "type": "loss", "content": 0.003171414602547884, "timestamp": "2025-10-01 04:12:23.641584", "step": 1032, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:23.703210", "step": 1032, "epoch": 2 }, { "type": "loss", "content": 0.01719476841390133, "timestamp": "2025-10-01 04:12:23.714948", "step": 1033, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:23.768955", "step": 1033, "epoch": 2 }, { "type": "loss", "content": 0.017137866467237473, "timestamp": "2025-10-01 04:12:23.772655", "step": 1034, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:23.811091", "step": 1034, "epoch": 2 }, { "type": "loss", "content": 0.01853066310286522, "timestamp": "2025-10-01 04:12:23.821087", "step": 1035, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:23.872087", "step": 1035, "epoch": 2 }, { "type": "loss", "content": 0.005616360809653997, "timestamp": "2025-10-01 04:12:23.903414", "step": 1036, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:23.955855", "step": 1036, "epoch": 2 }, { "type": "loss", "content": 0.008777479641139507, "timestamp": "2025-10-01 04:12:23.959796", "step": 1037, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:24.008614", "step": 1037, "epoch": 2 }, { "type": "loss", "content": 0.0419398732483387, "timestamp": "2025-10-01 04:12:24.018868", "step": 1038, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:24.068646", "step": 1038, "epoch": 2 }, { "type": "loss", "content": 0.01611681841313839, "timestamp": "2025-10-01 04:12:24.076779", "step": 1039, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:24.129063", "step": 1039, "epoch": 2 }, { "type": "loss", "content": 0.027612056583166122, "timestamp": "2025-10-01 04:12:24.158487", "step": 1040, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:24.201230", "step": 1040, "epoch": 2 }, { "type": "loss", "content": 0.014357500709593296, "timestamp": "2025-10-01 04:12:24.207431", "step": 1041, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:24.258529", "step": 1041, "epoch": 2 }, { "type": "loss", "content": 0.009304146282374859, "timestamp": "2025-10-01 04:12:24.267547", "step": 1042, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:24.312047", "step": 1042, "epoch": 2 }, { "type": "loss", "content": 0.03618435934185982, "timestamp": "2025-10-01 04:12:24.316876", "step": 1043, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:24.363088", "step": 1043, "epoch": 2 }, { "type": "loss", "content": 0.012262886390089989, "timestamp": "2025-10-01 04:12:24.387460", "step": 1044, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:24.435236", "step": 1044, "epoch": 2 }, { "type": "loss", "content": 0.007298172917217016, "timestamp": "2025-10-01 04:12:24.443779", "step": 1045, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:24.502207", "step": 1045, "epoch": 2 }, { "type": "loss", "content": 0.012754367664456367, "timestamp": "2025-10-01 04:12:24.508197", "step": 1046, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:24.556287", "step": 1046, "epoch": 2 }, { "type": "loss", "content": 0.03507836535573006, "timestamp": "2025-10-01 04:12:24.566906", "step": 1047, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:24.624464", "step": 1047, "epoch": 2 }, { "type": "loss", "content": 0.012025631964206696, "timestamp": "2025-10-01 04:12:24.655464", "step": 1048, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:24.710791", "step": 1048, "epoch": 2 }, { "type": "loss", "content": 0.009308640845119953, "timestamp": "2025-10-01 04:12:24.720986", "step": 1049, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:24.767803", "step": 1049, "epoch": 2 }, { "type": "loss", "content": 0.014288820326328278, "timestamp": "2025-10-01 04:12:24.775169", "step": 1050, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:24.821807", "step": 1050, "epoch": 2 }, { "type": "loss", "content": 0.02874881587922573, "timestamp": "2025-10-01 04:12:24.829322", "step": 1051, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:24.879997", "step": 1051, "epoch": 2 }, { "type": "loss", "content": 0.011433676816523075, "timestamp": "2025-10-01 04:12:24.908934", "step": 1052, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:24.957889", "step": 1052, "epoch": 2 }, { "type": "loss", "content": 0.01906905137002468, "timestamp": "2025-10-01 04:12:24.961964", "step": 1053, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:25.013911", "step": 1053, "epoch": 2 }, { "type": "loss", "content": 0.03443703427910805, "timestamp": "2025-10-01 04:12:25.022468", "step": 1054, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:25.065818", "step": 1054, "epoch": 2 }, { "type": "loss", "content": 0.02929351106286049, "timestamp": "2025-10-01 04:12:25.075300", "step": 1055, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:25.127217", "step": 1055, "epoch": 2 }, { "type": "loss", "content": 0.03993804007768631, "timestamp": "2025-10-01 04:12:25.156783", "step": 1056, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:25.208876", "step": 1056, "epoch": 2 }, { "type": "loss", "content": 0.021182114258408546, "timestamp": "2025-10-01 04:12:25.216179", "step": 1057, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:25.262382", "step": 1057, "epoch": 2 }, { "type": "loss", "content": 0.024790504947304726, "timestamp": "2025-10-01 04:12:25.265465", "step": 1058, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:25.305899", "step": 1058, "epoch": 2 }, { "type": "loss", "content": 0.03713010996580124, "timestamp": "2025-10-01 04:12:25.313698", "step": 1059, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:25.360189", "step": 1059, "epoch": 2 }, { "type": "loss", "content": 0.01229268778115511, "timestamp": "2025-10-01 04:12:25.389744", "step": 1060, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:25.436339", "step": 1060, "epoch": 2 }, { "type": "loss", "content": 0.013694092631340027, "timestamp": "2025-10-01 04:12:25.449497", "step": 1061, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:25.506737", "step": 1061, "epoch": 2 }, { "type": "loss", "content": 0.01466763112694025, "timestamp": "2025-10-01 04:12:25.514043", "step": 1062, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:25.561668", "step": 1062, "epoch": 2 }, { "type": "loss", "content": 0.006644328590482473, "timestamp": "2025-10-01 04:12:25.564531", "step": 1063, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:25.614569", "step": 1063, "epoch": 2 }, { "type": "loss", "content": 0.009213857352733612, "timestamp": "2025-10-01 04:12:25.648593", "step": 1064, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:25.706214", "step": 1064, "epoch": 2 }, { "type": "loss", "content": 0.03488415107131004, "timestamp": "2025-10-01 04:12:25.717488", "step": 1065, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:25.782296", "step": 1065, "epoch": 2 }, { "type": "loss", "content": 0.02670764923095703, "timestamp": "2025-10-01 04:12:25.794108", "step": 1066, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:25.856101", "step": 1066, "epoch": 2 }, { "type": "loss", "content": 0.026423903182148933, "timestamp": "2025-10-01 04:12:25.859779", "step": 1067, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:25.902843", "step": 1067, "epoch": 2 }, { "type": "loss", "content": 0.012425941415131092, "timestamp": "2025-10-01 04:12:25.938403", "step": 1068, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:25.997755", "step": 1068, "epoch": 2 }, { "type": "loss", "content": 0.004534396808594465, "timestamp": "2025-10-01 04:12:26.010122", "step": 1069, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:26.070158", "step": 1069, "epoch": 2 }, { "type": "loss", "content": 0.012708894908428192, "timestamp": "2025-10-01 04:12:26.083392", "step": 1070, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:26.143118", "step": 1070, "epoch": 2 }, { "type": "loss", "content": 0.020166311413049698, "timestamp": "2025-10-01 04:12:26.154906", "step": 1071, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:26.213791", "step": 1071, "epoch": 2 }, { "type": "loss", "content": 0.026705464348196983, "timestamp": "2025-10-01 04:12:26.246010", "step": 1072, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:26.306666", "step": 1072, "epoch": 2 }, { "type": "loss", "content": 0.01093844510614872, "timestamp": "2025-10-01 04:12:26.314580", "step": 1073, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:26.365533", "step": 1073, "epoch": 2 }, { "type": "loss", "content": 0.04105237126350403, "timestamp": "2025-10-01 04:12:26.374837", "step": 1074, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:26.422483", "step": 1074, "epoch": 2 }, { "type": "loss", "content": 0.05717204138636589, "timestamp": "2025-10-01 04:12:26.430254", "step": 1075, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:26.478912", "step": 1075, "epoch": 2 }, { "type": "loss", "content": 0.04415814206004143, "timestamp": "2025-10-01 04:12:26.509652", "step": 1076, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:26.558489", "step": 1076, "epoch": 2 }, { "type": "loss", "content": 0.0289046261459589, "timestamp": "2025-10-01 04:12:26.568501", "step": 1077, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:26.619976", "step": 1077, "epoch": 2 }, { "type": "loss", "content": 0.022754548117518425, "timestamp": "2025-10-01 04:12:26.629759", "step": 1078, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:26.675199", "step": 1078, "epoch": 2 }, { "type": "loss", "content": 0.018774649128317833, "timestamp": "2025-10-01 04:12:26.681482", "step": 1079, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:26.731949", "step": 1079, "epoch": 2 }, { "type": "loss", "content": 0.01819383166730404, "timestamp": "2025-10-01 04:12:26.759080", "step": 1080, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:26.814991", "step": 1080, "epoch": 2 }, { "type": "loss", "content": 0.020636798813939095, "timestamp": "2025-10-01 04:12:26.825870", "step": 1081, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:26.880369", "step": 1081, "epoch": 2 }, { "type": "loss", "content": 0.02792542800307274, "timestamp": "2025-10-01 04:12:26.890526", "step": 1082, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:26.949859", "step": 1082, "epoch": 2 }, { "type": "loss", "content": 0.01891268976032734, "timestamp": "2025-10-01 04:12:26.954700", "step": 1083, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:12:28.175094", "step": 1083, "epoch": 2 }, { "type": "pplx", "content": 62566369.90442915, "timestamp": "2025-10-01 04:12:28.182108", "step": 1083, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:28.219753", "step": 1083, "epoch": 2 }, { "type": "loss", "content": 0.013113982044160366, "timestamp": "2025-10-01 04:12:28.246671", "step": 1084, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:28.300031", "step": 1084, "epoch": 2 }, { "type": "loss", "content": 0.006288694683462381, "timestamp": "2025-10-01 04:12:28.306128", "step": 1085, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:28.350994", "step": 1085, "epoch": 2 }, { "type": "loss", "content": 0.020121892914175987, "timestamp": "2025-10-01 04:12:28.357824", "step": 1086, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:28.398651", "step": 1086, "epoch": 2 }, { "type": "loss", "content": 0.027080422267317772, "timestamp": "2025-10-01 04:12:28.404013", "step": 1087, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:28.453476", "step": 1087, "epoch": 2 }, { "type": "loss", "content": 0.009238681755959988, "timestamp": "2025-10-01 04:12:28.480708", "step": 1088, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:28.525283", "step": 1088, "epoch": 2 }, { "type": "loss", "content": 0.033649567514657974, "timestamp": "2025-10-01 04:12:28.532996", "step": 1089, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:28.576999", "step": 1089, "epoch": 2 }, { "type": "loss", "content": 0.021768080070614815, "timestamp": "2025-10-01 04:12:28.585337", "step": 1090, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:28.628749", "step": 1090, "epoch": 2 }, { "type": "loss", "content": 0.04176913574337959, "timestamp": "2025-10-01 04:12:28.633636", "step": 1091, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:28.678793", "step": 1091, "epoch": 2 }, { "type": "loss", "content": 0.007122901733964682, "timestamp": "2025-10-01 04:12:28.706112", "step": 1092, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:28.742425", "step": 1092, "epoch": 2 }, { "type": "loss", "content": 0.017248356714844704, "timestamp": "2025-10-01 04:12:28.746780", "step": 1093, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:28.793813", "step": 1093, "epoch": 2 }, { "type": "loss", "content": 0.011409069411456585, "timestamp": "2025-10-01 04:12:28.800770", "step": 1094, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:28.844952", "step": 1094, "epoch": 2 }, { "type": "loss", "content": 0.010414226911962032, "timestamp": "2025-10-01 04:12:28.848936", "step": 1095, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:28.890495", "step": 1095, "epoch": 2 }, { "type": "loss", "content": 0.005965986289083958, "timestamp": "2025-10-01 04:12:28.916790", "step": 1096, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:28.961079", "step": 1096, "epoch": 2 }, { "type": "loss", "content": 0.019725624471902847, "timestamp": "2025-10-01 04:12:28.965573", "step": 1097, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:29.009299", "step": 1097, "epoch": 2 }, { "type": "loss", "content": 0.019195079803466797, "timestamp": "2025-10-01 04:12:29.017144", "step": 1098, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:29.062348", "step": 1098, "epoch": 2 }, { "type": "loss", "content": 0.020952671766281128, "timestamp": "2025-10-01 04:12:29.068559", "step": 1099, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:29.110221", "step": 1099, "epoch": 2 }, { "type": "loss", "content": 0.027296748012304306, "timestamp": "2025-10-01 04:12:29.137134", "step": 1100, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:29.179356", "step": 1100, "epoch": 2 }, { "type": "loss", "content": 0.02847483567893505, "timestamp": "2025-10-01 04:12:29.183252", "step": 1101, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:29.227678", "step": 1101, "epoch": 2 }, { "type": "loss", "content": 0.01081875804811716, "timestamp": "2025-10-01 04:12:29.242281", "step": 1102, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:29.318405", "step": 1102, "epoch": 2 }, { "type": "loss", "content": 0.021221021190285683, "timestamp": "2025-10-01 04:12:29.329630", "step": 1103, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:29.389413", "step": 1103, "epoch": 2 }, { "type": "loss", "content": 0.018782924860715866, "timestamp": "2025-10-01 04:12:29.420176", "step": 1104, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:29.466310", "step": 1104, "epoch": 2 }, { "type": "loss", "content": 0.012332765385508537, "timestamp": "2025-10-01 04:12:29.476222", "step": 1105, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:29.531474", "step": 1105, "epoch": 2 }, { "type": "loss", "content": 0.011492065154016018, "timestamp": "2025-10-01 04:12:29.540729", "step": 1106, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:29.585476", "step": 1106, "epoch": 2 }, { "type": "loss", "content": 0.025456393137574196, "timestamp": "2025-10-01 04:12:29.590994", "step": 1107, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:29.634583", "step": 1107, "epoch": 2 }, { "type": "loss", "content": 0.021532809361815453, "timestamp": "2025-10-01 04:12:29.663363", "step": 1108, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:29.713162", "step": 1108, "epoch": 2 }, { "type": "loss", "content": 0.0116585036739707, "timestamp": "2025-10-01 04:12:29.721922", "step": 1109, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:29.769213", "step": 1109, "epoch": 2 }, { "type": "loss", "content": 0.019244659692049026, "timestamp": "2025-10-01 04:12:29.772276", "step": 1110, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:29.819181", "step": 1110, "epoch": 2 }, { "type": "loss", "content": 0.019055215641856194, "timestamp": "2025-10-01 04:12:29.825545", "step": 1111, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:29.870125", "step": 1111, "epoch": 2 }, { "type": "loss", "content": 0.028476441279053688, "timestamp": "2025-10-01 04:12:29.893903", "step": 1112, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:29.945679", "step": 1112, "epoch": 2 }, { "type": "loss", "content": 0.022367192432284355, "timestamp": "2025-10-01 04:12:29.953438", "step": 1113, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:30.005605", "step": 1113, "epoch": 2 }, { "type": "loss", "content": 0.012778395786881447, "timestamp": "2025-10-01 04:12:30.014956", "step": 1114, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:30.062455", "step": 1114, "epoch": 2 }, { "type": "loss", "content": 0.02362431026995182, "timestamp": "2025-10-01 04:12:30.069487", "step": 1115, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:30.112843", "step": 1115, "epoch": 2 }, { "type": "loss", "content": 0.0172555074095726, "timestamp": "2025-10-01 04:12:30.147296", "step": 1116, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:30.184903", "step": 1116, "epoch": 2 }, { "type": "loss", "content": 0.026192547753453255, "timestamp": "2025-10-01 04:12:30.192051", "step": 1117, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:30.237473", "step": 1117, "epoch": 2 }, { "type": "loss", "content": 0.014290613122284412, "timestamp": "2025-10-01 04:12:30.247066", "step": 1118, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:30.294475", "step": 1118, "epoch": 2 }, { "type": "loss", "content": 0.021653365343809128, "timestamp": "2025-10-01 04:12:30.303475", "step": 1119, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:30.351658", "step": 1119, "epoch": 2 }, { "type": "loss", "content": 0.030385613441467285, "timestamp": "2025-10-01 04:12:30.375922", "step": 1120, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:30.423559", "step": 1120, "epoch": 2 }, { "type": "loss", "content": 0.02170766517519951, "timestamp": "2025-10-01 04:12:30.431657", "step": 1121, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:30.477731", "step": 1121, "epoch": 2 }, { "type": "loss", "content": 0.010667859576642513, "timestamp": "2025-10-01 04:12:30.486076", "step": 1122, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:30.528274", "step": 1122, "epoch": 2 }, { "type": "loss", "content": 0.014071441255509853, "timestamp": "2025-10-01 04:12:30.535707", "step": 1123, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:30.580818", "step": 1123, "epoch": 2 }, { "type": "loss", "content": 0.014766133390367031, "timestamp": "2025-10-01 04:12:30.606708", "step": 1124, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:30.664808", "step": 1124, "epoch": 2 }, { "type": "loss", "content": 0.007835105992853642, "timestamp": "2025-10-01 04:12:30.670811", "step": 1125, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:30.711157", "step": 1125, "epoch": 2 }, { "type": "loss", "content": 0.020053701475262642, "timestamp": "2025-10-01 04:12:30.715814", "step": 1126, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:30.754530", "step": 1126, "epoch": 2 }, { "type": "loss", "content": 0.025708338245749474, "timestamp": "2025-10-01 04:12:30.758579", "step": 1127, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:30.798065", "step": 1127, "epoch": 2 }, { "type": "loss", "content": 0.007576673291623592, "timestamp": "2025-10-01 04:12:30.823696", "step": 1128, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:30.861459", "step": 1128, "epoch": 2 }, { "type": "loss", "content": 0.042808592319488525, "timestamp": "2025-10-01 04:12:30.866053", "step": 1129, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:30.906239", "step": 1129, "epoch": 2 }, { "type": "loss", "content": 0.032328877598047256, "timestamp": "2025-10-01 04:12:30.910682", "step": 1130, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:30.947319", "step": 1130, "epoch": 2 }, { "type": "loss", "content": 0.015181933529675007, "timestamp": "2025-10-01 04:12:30.952193", "step": 1131, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:30.992184", "step": 1131, "epoch": 2 }, { "type": "loss", "content": 0.010932973586022854, "timestamp": "2025-10-01 04:12:31.017964", "step": 1132, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:31.056055", "step": 1132, "epoch": 2 }, { "type": "loss", "content": 0.019718725234270096, "timestamp": "2025-10-01 04:12:31.061331", "step": 1133, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:31.104264", "step": 1133, "epoch": 2 }, { "type": "loss", "content": 0.016067685559391975, "timestamp": "2025-10-01 04:12:31.109066", "step": 1134, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:31.149572", "step": 1134, "epoch": 2 }, { "type": "loss", "content": 0.02257952094078064, "timestamp": "2025-10-01 04:12:31.157413", "step": 1135, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:31.203158", "step": 1135, "epoch": 2 }, { "type": "loss", "content": 0.011983809992671013, "timestamp": "2025-10-01 04:12:31.231501", "step": 1136, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:31.273413", "step": 1136, "epoch": 2 }, { "type": "loss", "content": 0.007152553182095289, "timestamp": "2025-10-01 04:12:31.276438", "step": 1137, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:31.318357", "step": 1137, "epoch": 2 }, { "type": "loss", "content": 0.018866227939724922, "timestamp": "2025-10-01 04:12:31.325994", "step": 1138, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:31.368558", "step": 1138, "epoch": 2 }, { "type": "loss", "content": 0.011750993318855762, "timestamp": "2025-10-01 04:12:31.371822", "step": 1139, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:31.411580", "step": 1139, "epoch": 2 }, { "type": "loss", "content": 0.01951892487704754, "timestamp": "2025-10-01 04:12:31.439193", "step": 1140, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:12:32.584175", "step": 1140, "epoch": 2 }, { "type": "pplx", "content": 62764748.542011686, "timestamp": "2025-10-01 04:12:32.588321", "step": 1140, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:32.624205", "step": 1140, "epoch": 2 }, { "type": "loss", "content": 0.029061226174235344, "timestamp": "2025-10-01 04:12:32.629949", "step": 1141, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:32.673699", "step": 1141, "epoch": 2 }, { "type": "loss", "content": 0.00953701976686716, "timestamp": "2025-10-01 04:12:32.676602", "step": 1142, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:32.719807", "step": 1142, "epoch": 2 }, { "type": "loss", "content": 0.01728624477982521, "timestamp": "2025-10-01 04:12:32.727261", "step": 1143, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:32.772725", "step": 1143, "epoch": 2 }, { "type": "loss", "content": 0.033103007823228836, "timestamp": "2025-10-01 04:12:32.801245", "step": 1144, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:32.844340", "step": 1144, "epoch": 2 }, { "type": "loss", "content": 0.004969587083905935, "timestamp": "2025-10-01 04:12:32.850720", "step": 1145, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:32.893335", "step": 1145, "epoch": 2 }, { "type": "loss", "content": 0.020533567294478416, "timestamp": "2025-10-01 04:12:32.899869", "step": 1146, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:32.950253", "step": 1146, "epoch": 2 }, { "type": "loss", "content": 0.010829508304595947, "timestamp": "2025-10-01 04:12:32.953914", "step": 1147, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:32.995169", "step": 1147, "epoch": 2 }, { "type": "loss", "content": 0.014489369466900826, "timestamp": "2025-10-01 04:12:33.025395", "step": 1148, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:33.073039", "step": 1148, "epoch": 2 }, { "type": "loss", "content": 0.02824089117348194, "timestamp": "2025-10-01 04:12:33.077672", "step": 1149, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:33.120786", "step": 1149, "epoch": 2 }, { "type": "loss", "content": 0.012532351538538933, "timestamp": "2025-10-01 04:12:33.127462", "step": 1150, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:33.172186", "step": 1150, "epoch": 2 }, { "type": "loss", "content": 0.03572159260511398, "timestamp": "2025-10-01 04:12:33.175873", "step": 1151, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:33.215618", "step": 1151, "epoch": 2 }, { "type": "loss", "content": 0.010852976702153683, "timestamp": "2025-10-01 04:12:33.245624", "step": 1152, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:33.291717", "step": 1152, "epoch": 2 }, { "type": "loss", "content": 0.012694260105490685, "timestamp": "2025-10-01 04:12:33.301775", "step": 1153, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:33.335968", "step": 1153, "epoch": 2 }, { "type": "loss", "content": 0.011462218128144741, "timestamp": "2025-10-01 04:12:33.339956", "step": 1154, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:33.371787", "step": 1154, "epoch": 2 }, { "type": "loss", "content": 0.02176797203719616, "timestamp": "2025-10-01 04:12:33.374914", "step": 1155, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:33.406083", "step": 1155, "epoch": 2 }, { "type": "loss", "content": 0.023000076413154602, "timestamp": "2025-10-01 04:12:33.431149", "step": 1156, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:33.462980", "step": 1156, "epoch": 2 }, { "type": "loss", "content": 0.013469113036990166, "timestamp": "2025-10-01 04:12:33.465651", "step": 1157, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:33.497278", "step": 1157, "epoch": 2 }, { "type": "loss", "content": 0.011314094997942448, "timestamp": "2025-10-01 04:12:33.499759", "step": 1158, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:33.531807", "step": 1158, "epoch": 2 }, { "type": "loss", "content": 0.013132537715137005, "timestamp": "2025-10-01 04:12:33.534294", "step": 1159, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:33.568740", "step": 1159, "epoch": 2 }, { "type": "loss", "content": 0.007822925224900246, "timestamp": "2025-10-01 04:12:33.592753", "step": 1160, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:33.624535", "step": 1160, "epoch": 2 }, { "type": "loss", "content": 0.007950437255203724, "timestamp": "2025-10-01 04:12:33.627429", "step": 1161, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:33.659359", "step": 1161, "epoch": 2 }, { "type": "loss", "content": 0.023385372012853622, "timestamp": "2025-10-01 04:12:33.663040", "step": 1162, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:33.696229", "step": 1162, "epoch": 2 }, { "type": "loss", "content": 0.01512717455625534, "timestamp": "2025-10-01 04:12:33.698622", "step": 1163, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:33.729740", "step": 1163, "epoch": 2 }, { "type": "loss", "content": 0.022730054333806038, "timestamp": "2025-10-01 04:12:33.754084", "step": 1164, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:33.785960", "step": 1164, "epoch": 2 }, { "type": "loss", "content": 0.016662919893860817, "timestamp": "2025-10-01 04:12:33.788657", "step": 1165, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:33.821178", "step": 1165, "epoch": 2 }, { "type": "loss", "content": 0.01865740306675434, "timestamp": "2025-10-01 04:12:33.823887", "step": 1166, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:33.855044", "step": 1166, "epoch": 2 }, { "type": "loss", "content": 0.010064325295388699, "timestamp": "2025-10-01 04:12:33.857789", "step": 1167, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:33.889101", "step": 1167, "epoch": 2 }, { "type": "loss", "content": 0.03933363035321236, "timestamp": "2025-10-01 04:12:33.913305", "step": 1168, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:33.946332", "step": 1168, "epoch": 2 }, { "type": "loss", "content": 0.01619604602456093, "timestamp": "2025-10-01 04:12:33.949401", "step": 1169, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:33.981698", "step": 1169, "epoch": 2 }, { "type": "loss", "content": 0.008631623350083828, "timestamp": "2025-10-01 04:12:33.984775", "step": 1170, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:34.016928", "step": 1170, "epoch": 2 }, { "type": "loss", "content": 0.01610485464334488, "timestamp": "2025-10-01 04:12:34.020058", "step": 1171, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:34.054794", "step": 1171, "epoch": 2 }, { "type": "loss", "content": 0.008041969500482082, "timestamp": "2025-10-01 04:12:34.078747", "step": 1172, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:12:34.111508", "step": 1172, "epoch": 2 }, { "type": "loss", "content": 0.012203284539282322, "timestamp": "2025-10-01 04:12:34.114186", "step": 1173, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:34.146232", "step": 1173, "epoch": 2 }, { "type": "loss", "content": 0.014361986890435219, "timestamp": "2025-10-01 04:12:34.148827", "step": 1174, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:34.181487", "step": 1174, "epoch": 2 }, { "type": "loss", "content": 0.01781057007610798, "timestamp": "2025-10-01 04:12:34.183913", "step": 1175, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:34.217961", "step": 1175, "epoch": 2 }, { "type": "loss", "content": 0.009551103226840496, "timestamp": "2025-10-01 04:12:34.242205", "step": 1176, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:34.274120", "step": 1176, "epoch": 2 }, { "type": "loss", "content": 0.029874419793486595, "timestamp": "2025-10-01 04:12:34.278147", "step": 1177, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:34.309583", "step": 1177, "epoch": 2 }, { "type": "loss", "content": 0.015719091519713402, "timestamp": "2025-10-01 04:12:34.312253", "step": 1178, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:34.343774", "step": 1178, "epoch": 2 }, { "type": "loss", "content": 0.027939843013882637, "timestamp": "2025-10-01 04:12:34.346675", "step": 1179, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:34.380926", "step": 1179, "epoch": 2 }, { "type": "loss", "content": 0.01609652303159237, "timestamp": "2025-10-01 04:12:34.404946", "step": 1180, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:34.442749", "step": 1180, "epoch": 2 }, { "type": "loss", "content": 0.030004026368260384, "timestamp": "2025-10-01 04:12:34.445012", "step": 1181, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:34.480249", "step": 1181, "epoch": 2 }, { "type": "loss", "content": 0.03301515057682991, "timestamp": "2025-10-01 04:12:34.483656", "step": 1182, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:34.524031", "step": 1182, "epoch": 2 }, { "type": "loss", "content": 0.01641576923429966, "timestamp": "2025-10-01 04:12:34.526590", "step": 1183, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:34.559718", "step": 1183, "epoch": 2 }, { "type": "loss", "content": 0.006728612817823887, "timestamp": "2025-10-01 04:12:34.583719", "step": 1184, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:34.614912", "step": 1184, "epoch": 2 }, { "type": "loss", "content": 0.021417854353785515, "timestamp": "2025-10-01 04:12:34.618043", "step": 1185, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:34.650819", "step": 1185, "epoch": 2 }, { "type": "loss", "content": 0.014348623342812061, "timestamp": "2025-10-01 04:12:34.653889", "step": 1186, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:34.686643", "step": 1186, "epoch": 2 }, { "type": "loss", "content": 0.006632161792367697, "timestamp": "2025-10-01 04:12:34.689112", "step": 1187, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:34.720413", "step": 1187, "epoch": 2 }, { "type": "loss", "content": 0.01136785838752985, "timestamp": "2025-10-01 04:12:34.744434", "step": 1188, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:34.776530", "step": 1188, "epoch": 2 }, { "type": "loss", "content": 0.011251566000282764, "timestamp": "2025-10-01 04:12:34.779197", "step": 1189, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:34.811725", "step": 1189, "epoch": 2 }, { "type": "loss", "content": 0.030348431318998337, "timestamp": "2025-10-01 04:12:34.814491", "step": 1190, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:34.849612", "step": 1190, "epoch": 2 }, { "type": "loss", "content": 0.01289790216833353, "timestamp": "2025-10-01 04:12:34.852248", "step": 1191, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:34.885484", "step": 1191, "epoch": 2 }, { "type": "loss", "content": 0.018557637929916382, "timestamp": "2025-10-01 04:12:34.909400", "step": 1192, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:34.940042", "step": 1192, "epoch": 2 }, { "type": "loss", "content": 0.017000071704387665, "timestamp": "2025-10-01 04:12:34.942079", "step": 1193, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:34.971529", "step": 1193, "epoch": 2 }, { "type": "loss", "content": 0.002277073683217168, "timestamp": "2025-10-01 04:12:34.973523", "step": 1194, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:35.004690", "step": 1194, "epoch": 2 }, { "type": "loss", "content": 0.05108269676566124, "timestamp": "2025-10-01 04:12:35.006447", "step": 1195, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:35.036451", "step": 1195, "epoch": 2 }, { "type": "loss", "content": 0.014600159600377083, "timestamp": "2025-10-01 04:12:35.060051", "step": 1196, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:35.090421", "step": 1196, "epoch": 2 }, { "type": "loss", "content": 0.05920051410794258, "timestamp": "2025-10-01 04:12:35.092657", "step": 1197, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:12:35.819119", "step": 1197, "epoch": 2 }, { "type": "pplx", "content": 69101049.67867415, "timestamp": "2025-10-01 04:12:35.821244", "step": 1197, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:35.849494", "step": 1197, "epoch": 2 }, { "type": "loss", "content": 0.0065787904895842075, "timestamp": "2025-10-01 04:12:35.851545", "step": 1198, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:35.881617", "step": 1198, "epoch": 2 }, { "type": "loss", "content": 0.011520309373736382, "timestamp": "2025-10-01 04:12:35.883572", "step": 1199, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:35.915336", "step": 1199, "epoch": 2 }, { "type": "loss", "content": 0.023187484592199326, "timestamp": "2025-10-01 04:12:35.939632", "step": 1200, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:35.970263", "step": 1200, "epoch": 2 }, { "type": "loss", "content": 0.052248697727918625, "timestamp": "2025-10-01 04:12:35.972760", "step": 1201, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:36.004487", "step": 1201, "epoch": 2 }, { "type": "loss", "content": 0.00456772418692708, "timestamp": "2025-10-01 04:12:36.006449", "step": 1202, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:36.036235", "step": 1202, "epoch": 2 }, { "type": "loss", "content": 0.0023930370807647705, "timestamp": "2025-10-01 04:12:36.038260", "step": 1203, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:12:36.068377", "step": 1203, "epoch": 2 }, { "type": "loss", "content": 0.004792022053152323, "timestamp": "2025-10-01 04:12:36.092087", "step": 1204, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:36.122527", "step": 1204, "epoch": 2 }, { "type": "loss", "content": 0.03757337108254433, "timestamp": "2025-10-01 04:12:36.124523", "step": 1205, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:36.154170", "step": 1205, "epoch": 2 }, { "type": "loss", "content": 0.03808292746543884, "timestamp": "2025-10-01 04:12:36.156368", "step": 1206, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:36.186193", "step": 1206, "epoch": 2 }, { "type": "loss", "content": 0.0030230432748794556, "timestamp": "2025-10-01 04:12:36.188496", "step": 1207, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:36.219100", "step": 1207, "epoch": 2 }, { "type": "loss", "content": 0.015412569046020508, "timestamp": "2025-10-01 04:12:36.242563", "step": 1208, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:36.272963", "step": 1208, "epoch": 2 }, { "type": "loss", "content": 0.014492099173367023, "timestamp": "2025-10-01 04:12:36.275009", "step": 1209, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:36.305306", "step": 1209, "epoch": 2 }, { "type": "loss", "content": 0.018064534291625023, "timestamp": "2025-10-01 04:12:36.307387", "step": 1210, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:36.337538", "step": 1210, "epoch": 2 }, { "type": "loss", "content": 0.03734589368104935, "timestamp": "2025-10-01 04:12:36.339730", "step": 1211, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:36.369990", "step": 1211, "epoch": 2 }, { "type": "loss", "content": 0.015398058108985424, "timestamp": "2025-10-01 04:12:36.393763", "step": 1212, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:36.424249", "step": 1212, "epoch": 2 }, { "type": "loss", "content": 0.008857528679072857, "timestamp": "2025-10-01 04:12:36.426096", "step": 1213, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:36.457153", "step": 1213, "epoch": 2 }, { "type": "loss", "content": 0.012673860415816307, "timestamp": "2025-10-01 04:12:36.459215", "step": 1214, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:36.489247", "step": 1214, "epoch": 2 }, { "type": "loss", "content": 0.01826346106827259, "timestamp": "2025-10-01 04:12:36.491135", "step": 1215, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:36.521267", "step": 1215, "epoch": 2 }, { "type": "loss", "content": 0.022028954699635506, "timestamp": "2025-10-01 04:12:36.545367", "step": 1216, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:36.575524", "step": 1216, "epoch": 2 }, { "type": "loss", "content": 0.013720040209591389, "timestamp": "2025-10-01 04:12:36.577401", "step": 1217, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:36.607402", "step": 1217, "epoch": 2 }, { "type": "loss", "content": 0.00724714994430542, "timestamp": "2025-10-01 04:12:36.609833", "step": 1218, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:36.639914", "step": 1218, "epoch": 2 }, { "type": "loss", "content": 0.008106841705739498, "timestamp": "2025-10-01 04:12:36.642402", "step": 1219, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:36.672793", "step": 1219, "epoch": 2 }, { "type": "loss", "content": 0.009874098002910614, "timestamp": "2025-10-01 04:12:36.696779", "step": 1220, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:36.728949", "step": 1220, "epoch": 2 }, { "type": "loss", "content": 0.01156134158372879, "timestamp": "2025-10-01 04:12:36.731050", "step": 1221, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:36.762049", "step": 1221, "epoch": 2 }, { "type": "loss", "content": 0.007826543413102627, "timestamp": "2025-10-01 04:12:36.764101", "step": 1222, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:36.793637", "step": 1222, "epoch": 2 }, { "type": "loss", "content": 0.008319816552102566, "timestamp": "2025-10-01 04:12:36.795713", "step": 1223, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:36.825563", "step": 1223, "epoch": 2 }, { "type": "loss", "content": 0.010037191212177277, "timestamp": "2025-10-01 04:12:36.849058", "step": 1224, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:36.879230", "step": 1224, "epoch": 2 }, { "type": "loss", "content": 0.012897380627691746, "timestamp": "2025-10-01 04:12:36.881410", "step": 1225, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:36.911686", "step": 1225, "epoch": 2 }, { "type": "loss", "content": 0.004419870208948851, "timestamp": "2025-10-01 04:12:36.913889", "step": 1226, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:36.944695", "step": 1226, "epoch": 2 }, { "type": "loss", "content": 0.011351182125508785, "timestamp": "2025-10-01 04:12:36.946817", "step": 1227, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:36.976795", "step": 1227, "epoch": 2 }, { "type": "loss", "content": 0.016698114573955536, "timestamp": "2025-10-01 04:12:37.000503", "step": 1228, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:37.030489", "step": 1228, "epoch": 2 }, { "type": "loss", "content": 0.005861933343112469, "timestamp": "2025-10-01 04:12:37.032403", "step": 1229, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:37.062255", "step": 1229, "epoch": 2 }, { "type": "loss", "content": 0.029591960832476616, "timestamp": "2025-10-01 04:12:37.064641", "step": 1230, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:37.095276", "step": 1230, "epoch": 2 }, { "type": "loss", "content": 0.008629663847386837, "timestamp": "2025-10-01 04:12:37.097268", "step": 1231, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:37.127494", "step": 1231, "epoch": 2 }, { "type": "loss", "content": 0.021872971206903458, "timestamp": "2025-10-01 04:12:37.151206", "step": 1232, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:37.183106", "step": 1232, "epoch": 2 }, { "type": "loss", "content": 0.0034721808042377234, "timestamp": "2025-10-01 04:12:37.185328", "step": 1233, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:37.215008", "step": 1233, "epoch": 2 }, { "type": "loss", "content": 0.00947582721710205, "timestamp": "2025-10-01 04:12:37.216971", "step": 1234, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:37.246465", "step": 1234, "epoch": 2 }, { "type": "loss", "content": 0.01878984272480011, "timestamp": "2025-10-01 04:12:37.248612", "step": 1235, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:37.278641", "step": 1235, "epoch": 2 }, { "type": "loss", "content": 0.011062067933380604, "timestamp": "2025-10-01 04:12:37.302135", "step": 1236, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:37.332156", "step": 1236, "epoch": 2 }, { "type": "loss", "content": 0.01161190029233694, "timestamp": "2025-10-01 04:12:37.334066", "step": 1237, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:37.363959", "step": 1237, "epoch": 2 }, { "type": "loss", "content": 0.01545110996812582, "timestamp": "2025-10-01 04:12:37.366521", "step": 1238, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:37.396744", "step": 1238, "epoch": 2 }, { "type": "loss", "content": 0.02992149256169796, "timestamp": "2025-10-01 04:12:37.399040", "step": 1239, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:37.428977", "step": 1239, "epoch": 2 }, { "type": "loss", "content": 0.01887401007115841, "timestamp": "2025-10-01 04:12:37.452764", "step": 1240, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:37.483043", "step": 1240, "epoch": 2 }, { "type": "loss", "content": 0.007618209812790155, "timestamp": "2025-10-01 04:12:37.485226", "step": 1241, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:37.515270", "step": 1241, "epoch": 2 }, { "type": "loss", "content": 0.017871305346488953, "timestamp": "2025-10-01 04:12:37.517902", "step": 1242, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:37.547853", "step": 1242, "epoch": 2 }, { "type": "loss", "content": 0.02309580147266388, "timestamp": "2025-10-01 04:12:37.550234", "step": 1243, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:37.580443", "step": 1243, "epoch": 2 }, { "type": "loss", "content": 0.023178985342383385, "timestamp": "2025-10-01 04:12:37.604271", "step": 1244, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:12:37.635392", "step": 1244, "epoch": 2 }, { "type": "loss", "content": 0.028510218486189842, "timestamp": "2025-10-01 04:12:37.637384", "step": 1245, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:37.667795", "step": 1245, "epoch": 2 }, { "type": "loss", "content": 0.02059764787554741, "timestamp": "2025-10-01 04:12:37.670323", "step": 1246, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:37.700398", "step": 1246, "epoch": 2 }, { "type": "loss", "content": 0.02575409784913063, "timestamp": "2025-10-01 04:12:37.702534", "step": 1247, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:37.732556", "step": 1247, "epoch": 2 }, { "type": "loss", "content": 0.01112529169768095, "timestamp": "2025-10-01 04:12:37.756165", "step": 1248, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:37.786576", "step": 1248, "epoch": 2 }, { "type": "loss", "content": 0.026837946847081184, "timestamp": "2025-10-01 04:12:37.788497", "step": 1249, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:37.818068", "step": 1249, "epoch": 2 }, { "type": "loss", "content": 0.030917344614863396, "timestamp": "2025-10-01 04:12:37.820132", "step": 1250, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:37.851712", "step": 1250, "epoch": 2 }, { "type": "loss", "content": 0.022184815257787704, "timestamp": "2025-10-01 04:12:37.853716", "step": 1251, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:37.883225", "step": 1251, "epoch": 2 }, { "type": "loss", "content": 0.009707905352115631, "timestamp": "2025-10-01 04:12:37.907652", "step": 1252, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:12:37.937552", "step": 1252, "epoch": 2 }, { "type": "loss", "content": 0.003488952526822686, "timestamp": "2025-10-01 04:12:37.939557", "step": 1253, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:37.969523", "step": 1253, "epoch": 2 }, { "type": "loss", "content": 0.00924562569707632, "timestamp": "2025-10-01 04:12:37.971995", "step": 1254, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:12:38.718639", "step": 1254, "epoch": 2 }, { "type": "pplx", "content": 72587363.3396665, "timestamp": "2025-10-01 04:12:38.722363", "step": 1254, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:38.755399", "step": 1254, "epoch": 2 }, { "type": "loss", "content": 0.007001742720603943, "timestamp": "2025-10-01 04:12:38.759029", "step": 1255, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:38.793427", "step": 1255, "epoch": 2 }, { "type": "loss", "content": 0.027426881715655327, "timestamp": "2025-10-01 04:12:38.822925", "step": 1256, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:38.856954", "step": 1256, "epoch": 2 }, { "type": "loss", "content": 0.01879812777042389, "timestamp": "2025-10-01 04:12:38.860658", "step": 1257, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:38.894720", "step": 1257, "epoch": 2 }, { "type": "loss", "content": 0.01835942268371582, "timestamp": "2025-10-01 04:12:38.900475", "step": 1258, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:38.936837", "step": 1258, "epoch": 2 }, { "type": "loss", "content": 0.009284531697630882, "timestamp": "2025-10-01 04:12:38.940589", "step": 1259, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:38.974956", "step": 1259, "epoch": 2 }, { "type": "loss", "content": 0.01425645500421524, "timestamp": "2025-10-01 04:12:38.998488", "step": 1260, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:39.034927", "step": 1260, "epoch": 2 }, { "type": "loss", "content": 0.050299111753702164, "timestamp": "2025-10-01 04:12:39.039921", "step": 1261, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:39.103074", "step": 1261, "epoch": 2 }, { "type": "loss", "content": 0.01674676313996315, "timestamp": "2025-10-01 04:12:39.118546", "step": 1262, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:39.196189", "step": 1262, "epoch": 2 }, { "type": "loss", "content": 0.02486700937151909, "timestamp": "2025-10-01 04:12:39.199387", "step": 1263, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:39.271126", "step": 1263, "epoch": 2 }, { "type": "loss", "content": 0.015348898246884346, "timestamp": "2025-10-01 04:12:39.309601", "step": 1264, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:39.382649", "step": 1264, "epoch": 2 }, { "type": "loss", "content": 0.01046433113515377, "timestamp": "2025-10-01 04:12:39.397938", "step": 1265, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:39.474775", "step": 1265, "epoch": 2 }, { "type": "loss", "content": 0.013404331170022488, "timestamp": "2025-10-01 04:12:39.487640", "step": 1266, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:39.559680", "step": 1266, "epoch": 2 }, { "type": "loss", "content": 0.01794937252998352, "timestamp": "2025-10-01 04:12:39.565149", "step": 1267, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:39.604968", "step": 1267, "epoch": 2 }, { "type": "loss", "content": 0.027332397177815437, "timestamp": "2025-10-01 04:12:39.642227", "step": 1268, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:39.713955", "step": 1268, "epoch": 2 }, { "type": "loss", "content": 0.016648003831505775, "timestamp": "2025-10-01 04:12:39.727933", "step": 1269, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:39.801066", "step": 1269, "epoch": 2 }, { "type": "loss", "content": 0.02556251361966133, "timestamp": "2025-10-01 04:12:39.818648", "step": 1270, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:39.862577", "step": 1270, "epoch": 2 }, { "type": "loss", "content": 0.007685820106416941, "timestamp": "2025-10-01 04:12:39.873162", "step": 1271, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:39.937443", "step": 1271, "epoch": 2 }, { "type": "loss", "content": 0.021718529984354973, "timestamp": "2025-10-01 04:12:39.967126", "step": 1272, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:40.026398", "step": 1272, "epoch": 2 }, { "type": "loss", "content": 0.008440138772130013, "timestamp": "2025-10-01 04:12:40.030765", "step": 1273, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:40.079568", "step": 1273, "epoch": 2 }, { "type": "loss", "content": 0.022746693342924118, "timestamp": "2025-10-01 04:12:40.086934", "step": 1274, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:40.142492", "step": 1274, "epoch": 2 }, { "type": "loss", "content": 0.007633621338754892, "timestamp": "2025-10-01 04:12:40.153098", "step": 1275, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:40.208552", "step": 1275, "epoch": 2 }, { "type": "loss", "content": 0.005806318949908018, "timestamp": "2025-10-01 04:12:40.233761", "step": 1276, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:40.274585", "step": 1276, "epoch": 2 }, { "type": "loss", "content": 0.016788044944405556, "timestamp": "2025-10-01 04:12:40.277998", "step": 1277, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:40.334141", "step": 1277, "epoch": 2 }, { "type": "loss", "content": 0.01413695514202118, "timestamp": "2025-10-01 04:12:40.345076", "step": 1278, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:40.399249", "step": 1278, "epoch": 2 }, { "type": "loss", "content": 0.03610283136367798, "timestamp": "2025-10-01 04:12:40.404084", "step": 1279, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:40.444491", "step": 1279, "epoch": 2 }, { "type": "loss", "content": 0.01976608671247959, "timestamp": "2025-10-01 04:12:40.477011", "step": 1280, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:40.532785", "step": 1280, "epoch": 2 }, { "type": "loss", "content": 0.010782641358673573, "timestamp": "2025-10-01 04:12:40.543694", "step": 1281, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:40.599570", "step": 1281, "epoch": 2 }, { "type": "loss", "content": 0.01856873743236065, "timestamp": "2025-10-01 04:12:40.609880", "step": 1282, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:40.663642", "step": 1282, "epoch": 2 }, { "type": "loss", "content": 0.008483139798045158, "timestamp": "2025-10-01 04:12:40.673668", "step": 1283, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:12:40.737905", "step": 1283, "epoch": 2 }, { "type": "loss", "content": 0.007417923770844936, "timestamp": "2025-10-01 04:12:40.769858", "step": 1284, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:40.812176", "step": 1284, "epoch": 2 }, { "type": "loss", "content": 0.030951378867030144, "timestamp": "2025-10-01 04:12:40.816743", "step": 1285, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:40.873339", "step": 1285, "epoch": 2 }, { "type": "loss", "content": 0.006046353839337826, "timestamp": "2025-10-01 04:12:40.881636", "step": 1286, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:40.931805", "step": 1286, "epoch": 2 }, { "type": "loss", "content": 0.01013564970344305, "timestamp": "2025-10-01 04:12:40.942893", "step": 1287, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:41.000376", "step": 1287, "epoch": 2 }, { "type": "loss", "content": 0.005417356733232737, "timestamp": "2025-10-01 04:12:41.032435", "step": 1288, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:41.086631", "step": 1288, "epoch": 2 }, { "type": "loss", "content": 0.03147173300385475, "timestamp": "2025-10-01 04:12:41.088946", "step": 1289, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:41.133055", "step": 1289, "epoch": 2 }, { "type": "loss", "content": 0.012722591869533062, "timestamp": "2025-10-01 04:12:41.142496", "step": 1290, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:41.196519", "step": 1290, "epoch": 2 }, { "type": "loss", "content": 0.018516169860959053, "timestamp": "2025-10-01 04:12:41.207964", "step": 1291, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:41.261168", "step": 1291, "epoch": 2 }, { "type": "loss", "content": 0.015589392744004726, "timestamp": "2025-10-01 04:12:41.293501", "step": 1292, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:41.342087", "step": 1292, "epoch": 2 }, { "type": "loss", "content": 0.004773109219968319, "timestamp": "2025-10-01 04:12:41.345519", "step": 1293, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:41.397055", "step": 1293, "epoch": 2 }, { "type": "loss", "content": 0.006328233052045107, "timestamp": "2025-10-01 04:12:41.408769", "step": 1294, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:41.464362", "step": 1294, "epoch": 2 }, { "type": "loss", "content": 0.00493656238541007, "timestamp": "2025-10-01 04:12:41.476386", "step": 1295, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:41.534489", "step": 1295, "epoch": 2 }, { "type": "loss", "content": 0.027140628546476364, "timestamp": "2025-10-01 04:12:41.564498", "step": 1296, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:41.607600", "step": 1296, "epoch": 2 }, { "type": "loss", "content": 0.01725948601961136, "timestamp": "2025-10-01 04:12:41.616107", "step": 1297, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:41.665739", "step": 1297, "epoch": 2 }, { "type": "loss", "content": 0.021593991667032242, "timestamp": "2025-10-01 04:12:41.675446", "step": 1298, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:41.733772", "step": 1298, "epoch": 2 }, { "type": "loss", "content": 0.023603880777955055, "timestamp": "2025-10-01 04:12:41.738912", "step": 1299, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:41.797366", "step": 1299, "epoch": 2 }, { "type": "loss", "content": 0.013666218146681786, "timestamp": "2025-10-01 04:12:41.826305", "step": 1300, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:41.867111", "step": 1300, "epoch": 2 }, { "type": "loss", "content": 0.02010071836411953, "timestamp": "2025-10-01 04:12:41.876920", "step": 1301, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:41.929900", "step": 1301, "epoch": 2 }, { "type": "loss", "content": 0.011804967187345028, "timestamp": "2025-10-01 04:12:41.938433", "step": 1302, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:41.995283", "step": 1302, "epoch": 2 }, { "type": "loss", "content": 0.014817488379776478, "timestamp": "2025-10-01 04:12:42.005369", "step": 1303, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:42.063041", "step": 1303, "epoch": 2 }, { "type": "loss", "content": 0.006263951305299997, "timestamp": "2025-10-01 04:12:42.097569", "step": 1304, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:42.155080", "step": 1304, "epoch": 2 }, { "type": "loss", "content": 0.013046405278146267, "timestamp": "2025-10-01 04:12:42.159911", "step": 1305, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:42.233711", "step": 1305, "epoch": 2 }, { "type": "loss", "content": 0.00669970503076911, "timestamp": "2025-10-01 04:12:42.245833", "step": 1306, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:42.303415", "step": 1306, "epoch": 2 }, { "type": "loss", "content": 0.015233458951115608, "timestamp": "2025-10-01 04:12:42.309314", "step": 1307, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:42.362067", "step": 1307, "epoch": 2 }, { "type": "loss", "content": 0.01641346700489521, "timestamp": "2025-10-01 04:12:42.388069", "step": 1308, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:42.443349", "step": 1308, "epoch": 2 }, { "type": "loss", "content": 0.007500012405216694, "timestamp": "2025-10-01 04:12:42.450010", "step": 1309, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:42.511000", "step": 1309, "epoch": 2 }, { "type": "loss", "content": 0.015264661982655525, "timestamp": "2025-10-01 04:12:42.520989", "step": 1310, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:12:42.578916", "step": 1310, "epoch": 2 }, { "type": "loss", "content": 0.018528863787651062, "timestamp": "2025-10-01 04:12:42.588548", "step": 1311, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:12:44.227776", "step": 1311, "epoch": 2 }, { "type": "pplx", "content": 78869573.43853244, "timestamp": "2025-10-01 04:12:44.245101", "step": 1311, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:44.299810", "step": 1311, "epoch": 2 }, { "type": "loss", "content": 0.0038048250135034323, "timestamp": "2025-10-01 04:12:44.366906", "step": 1312, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:44.424004", "step": 1312, "epoch": 2 }, { "type": "loss", "content": 0.006456837523728609, "timestamp": "2025-10-01 04:12:44.432596", "step": 1313, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:44.476682", "step": 1313, "epoch": 2 }, { "type": "loss", "content": 0.0011937960516661406, "timestamp": "2025-10-01 04:12:44.486853", "step": 1314, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:44.561622", "step": 1314, "epoch": 2 }, { "type": "loss", "content": 0.015916863456368446, "timestamp": "2025-10-01 04:12:44.577906", "step": 1315, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:44.633700", "step": 1315, "epoch": 2 }, { "type": "loss", "content": 0.00806934293359518, "timestamp": "2025-10-01 04:12:44.667446", "step": 1316, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:44.718982", "step": 1316, "epoch": 2 }, { "type": "loss", "content": 0.01778421550989151, "timestamp": "2025-10-01 04:12:44.730100", "step": 1317, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:44.793413", "step": 1317, "epoch": 2 }, { "type": "loss", "content": 0.0024733245372772217, "timestamp": "2025-10-01 04:12:44.807968", "step": 1318, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:44.867276", "step": 1318, "epoch": 2 }, { "type": "loss", "content": 0.02881716750562191, "timestamp": "2025-10-01 04:12:44.879554", "step": 1319, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:44.951970", "step": 1319, "epoch": 2 }, { "type": "loss", "content": 0.007030692417174578, "timestamp": "2025-10-01 04:12:44.983607", "step": 1320, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:45.051807", "step": 1320, "epoch": 2 }, { "type": "loss", "content": 0.04227086529135704, "timestamp": "2025-10-01 04:12:45.074219", "step": 1321, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:45.171443", "step": 1321, "epoch": 2 }, { "type": "loss", "content": 0.029042348265647888, "timestamp": "2025-10-01 04:12:45.212720", "step": 1322, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:45.281702", "step": 1322, "epoch": 2 }, { "type": "loss", "content": 0.01102940458804369, "timestamp": "2025-10-01 04:12:45.295208", "step": 1323, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:45.386960", "step": 1323, "epoch": 2 }, { "type": "loss", "content": 0.006163137499243021, "timestamp": "2025-10-01 04:12:45.426504", "step": 1324, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:45.482658", "step": 1324, "epoch": 2 }, { "type": "loss", "content": 0.0057218037545681, "timestamp": "2025-10-01 04:12:45.509154", "step": 1325, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:45.571372", "step": 1325, "epoch": 2 }, { "type": "loss", "content": 0.00786365382373333, "timestamp": "2025-10-01 04:12:45.579516", "step": 1326, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:45.645173", "step": 1326, "epoch": 2 }, { "type": "loss", "content": 0.007059741299599409, "timestamp": "2025-10-01 04:12:45.648177", "step": 1327, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:45.702150", "step": 1327, "epoch": 2 }, { "type": "loss", "content": 0.008876658976078033, "timestamp": "2025-10-01 04:12:45.733461", "step": 1328, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:45.787199", "step": 1328, "epoch": 2 }, { "type": "loss", "content": 0.01941383071243763, "timestamp": "2025-10-01 04:12:45.797192", "step": 1329, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:45.848611", "step": 1329, "epoch": 2 }, { "type": "loss", "content": 0.005369930062443018, "timestamp": "2025-10-01 04:12:45.862259", "step": 1330, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:45.920605", "step": 1330, "epoch": 2 }, { "type": "loss", "content": 0.04618501663208008, "timestamp": "2025-10-01 04:12:45.924922", "step": 1331, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:45.981540", "step": 1331, "epoch": 2 }, { "type": "loss", "content": 0.014543636702001095, "timestamp": "2025-10-01 04:12:46.014255", "step": 1332, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:46.059662", "step": 1332, "epoch": 2 }, { "type": "loss", "content": 0.01306221354752779, "timestamp": "2025-10-01 04:12:46.070833", "step": 1333, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:46.132546", "step": 1333, "epoch": 2 }, { "type": "loss", "content": 0.016116904094815254, "timestamp": "2025-10-01 04:12:46.142433", "step": 1334, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:46.200866", "step": 1334, "epoch": 2 }, { "type": "loss", "content": 0.025694649666547775, "timestamp": "2025-10-01 04:12:46.210264", "step": 1335, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:46.267224", "step": 1335, "epoch": 2 }, { "type": "loss", "content": 0.006386879365891218, "timestamp": "2025-10-01 04:12:46.296373", "step": 1336, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:46.346554", "step": 1336, "epoch": 2 }, { "type": "loss", "content": 0.011319021694362164, "timestamp": "2025-10-01 04:12:46.355555", "step": 1337, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:46.411148", "step": 1337, "epoch": 2 }, { "type": "loss", "content": 0.016311505809426308, "timestamp": "2025-10-01 04:12:46.419762", "step": 1338, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:46.478960", "step": 1338, "epoch": 2 }, { "type": "loss", "content": 0.003795040538534522, "timestamp": "2025-10-01 04:12:46.486964", "step": 1339, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:46.542375", "step": 1339, "epoch": 2 }, { "type": "loss", "content": 0.014971991069614887, "timestamp": "2025-10-01 04:12:46.572419", "step": 1340, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:46.622727", "step": 1340, "epoch": 2 }, { "type": "loss", "content": 0.02324819751083851, "timestamp": "2025-10-01 04:12:46.632490", "step": 1341, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:46.674567", "step": 1341, "epoch": 2 }, { "type": "loss", "content": 0.009367035701870918, "timestamp": "2025-10-01 04:12:46.685230", "step": 1342, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:46.746431", "step": 1342, "epoch": 2 }, { "type": "loss", "content": 0.008872377686202526, "timestamp": "2025-10-01 04:12:46.756063", "step": 1343, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:46.812618", "step": 1343, "epoch": 2 }, { "type": "loss", "content": 0.015090836212038994, "timestamp": "2025-10-01 04:12:46.842621", "step": 1344, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:46.885791", "step": 1344, "epoch": 2 }, { "type": "loss", "content": 0.0024052553344517946, "timestamp": "2025-10-01 04:12:46.888946", "step": 1345, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:46.942596", "step": 1345, "epoch": 2 }, { "type": "loss", "content": 0.015845181420445442, "timestamp": "2025-10-01 04:12:46.952123", "step": 1346, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:46.998233", "step": 1346, "epoch": 2 }, { "type": "loss", "content": 0.015794331207871437, "timestamp": "2025-10-01 04:12:47.001259", "step": 1347, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:47.049684", "step": 1347, "epoch": 2 }, { "type": "loss", "content": 0.0033900614362210035, "timestamp": "2025-10-01 04:12:47.074828", "step": 1348, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:47.139834", "step": 1348, "epoch": 2 }, { "type": "loss", "content": 0.0025195071939378977, "timestamp": "2025-10-01 04:12:47.144106", "step": 1349, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:47.198101", "step": 1349, "epoch": 2 }, { "type": "loss", "content": 0.007320540491491556, "timestamp": "2025-10-01 04:12:47.200675", "step": 1350, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:12:47.268689", "step": 1350, "epoch": 2 }, { "type": "loss", "content": 0.006169808562844992, "timestamp": "2025-10-01 04:12:47.284157", "step": 1351, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:47.344166", "step": 1351, "epoch": 2 }, { "type": "loss", "content": 0.0029406158719211817, "timestamp": "2025-10-01 04:12:47.375236", "step": 1352, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:47.443664", "step": 1352, "epoch": 2 }, { "type": "loss", "content": 0.004101304803043604, "timestamp": "2025-10-01 04:12:47.456094", "step": 1353, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:47.513465", "step": 1353, "epoch": 2 }, { "type": "loss", "content": 0.019129853695631027, "timestamp": "2025-10-01 04:12:47.526344", "step": 1354, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:47.585727", "step": 1354, "epoch": 2 }, { "type": "loss", "content": 0.022670626640319824, "timestamp": "2025-10-01 04:12:47.595314", "step": 1355, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:47.649494", "step": 1355, "epoch": 2 }, { "type": "loss", "content": 0.03189031034708023, "timestamp": "2025-10-01 04:12:47.674507", "step": 1356, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:47.719419", "step": 1356, "epoch": 2 }, { "type": "loss", "content": 0.009254112839698792, "timestamp": "2025-10-01 04:12:47.732305", "step": 1357, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:47.788157", "step": 1357, "epoch": 2 }, { "type": "loss", "content": 0.008511737920343876, "timestamp": "2025-10-01 04:12:47.799441", "step": 1358, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:47.859103", "step": 1358, "epoch": 2 }, { "type": "loss", "content": 0.02656170353293419, "timestamp": "2025-10-01 04:12:47.873133", "step": 1359, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:47.914929", "step": 1359, "epoch": 2 }, { "type": "loss", "content": 0.003011648776009679, "timestamp": "2025-10-01 04:12:47.950952", "step": 1360, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:47.992653", "step": 1360, "epoch": 2 }, { "type": "loss", "content": 0.03632878139615059, "timestamp": "2025-10-01 04:12:47.999005", "step": 1361, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:48.064544", "step": 1361, "epoch": 2 }, { "type": "loss", "content": 0.01709390990436077, "timestamp": "2025-10-01 04:12:48.076366", "step": 1362, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:48.130187", "step": 1362, "epoch": 2 }, { "type": "loss", "content": 0.00863829255104065, "timestamp": "2025-10-01 04:12:48.133265", "step": 1363, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:48.184623", "step": 1363, "epoch": 2 }, { "type": "loss", "content": 0.003445104230195284, "timestamp": "2025-10-01 04:12:48.211330", "step": 1364, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:48.267189", "step": 1364, "epoch": 2 }, { "type": "loss", "content": 0.0165807344019413, "timestamp": "2025-10-01 04:12:48.278535", "step": 1365, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:48.343826", "step": 1365, "epoch": 2 }, { "type": "loss", "content": 0.006906905211508274, "timestamp": "2025-10-01 04:12:48.352834", "step": 1366, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:48.397252", "step": 1366, "epoch": 2 }, { "type": "loss", "content": 0.01691582053899765, "timestamp": "2025-10-01 04:12:48.400080", "step": 1367, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:48.451326", "step": 1367, "epoch": 2 }, { "type": "loss", "content": 0.011257420293986797, "timestamp": "2025-10-01 04:12:48.480696", "step": 1368, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:12:49.949607", "step": 1368, "epoch": 2 }, { "type": "pplx", "content": 85059319.48994762, "timestamp": "2025-10-01 04:12:49.959371", "step": 1368, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:50.006244", "step": 1368, "epoch": 2 }, { "type": "loss", "content": 0.0026699949521571398, "timestamp": "2025-10-01 04:12:50.016682", "step": 1369, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:50.075039", "step": 1369, "epoch": 2 }, { "type": "loss", "content": 0.0056277550756931305, "timestamp": "2025-10-01 04:12:50.087037", "step": 1370, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:12:50.144334", "step": 1370, "epoch": 2 }, { "type": "loss", "content": 0.008586110547184944, "timestamp": "2025-10-01 04:12:50.154356", "step": 1371, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:50.211740", "step": 1371, "epoch": 2 }, { "type": "loss", "content": 0.0090508246794343, "timestamp": "2025-10-01 04:12:50.243630", "step": 1372, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:50.297733", "step": 1372, "epoch": 2 }, { "type": "loss", "content": 0.013520614244043827, "timestamp": "2025-10-01 04:12:50.309767", "step": 1373, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:50.364701", "step": 1373, "epoch": 2 }, { "type": "loss", "content": 0.023014064878225327, "timestamp": "2025-10-01 04:12:50.374906", "step": 1374, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:50.430937", "step": 1374, "epoch": 2 }, { "type": "loss", "content": 0.025585060939192772, "timestamp": "2025-10-01 04:12:50.439301", "step": 1375, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:50.496777", "step": 1375, "epoch": 2 }, { "type": "loss", "content": 0.006834993604570627, "timestamp": "2025-10-01 04:12:50.521045", "step": 1376, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:50.576978", "step": 1376, "epoch": 2 }, { "type": "loss", "content": 0.027560051530599594, "timestamp": "2025-10-01 04:12:50.587493", "step": 1377, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:50.648749", "step": 1377, "epoch": 2 }, { "type": "loss", "content": 0.01523479912430048, "timestamp": "2025-10-01 04:12:50.656771", "step": 1378, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:50.711529", "step": 1378, "epoch": 2 }, { "type": "loss", "content": 0.029804764315485954, "timestamp": "2025-10-01 04:12:50.720572", "step": 1379, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:50.779679", "step": 1379, "epoch": 2 }, { "type": "loss", "content": 0.0074521261267364025, "timestamp": "2025-10-01 04:12:50.811163", "step": 1380, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:50.862810", "step": 1380, "epoch": 2 }, { "type": "loss", "content": 0.025462573394179344, "timestamp": "2025-10-01 04:12:50.873074", "step": 1381, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:50.920966", "step": 1381, "epoch": 2 }, { "type": "loss", "content": 0.019137471914291382, "timestamp": "2025-10-01 04:12:50.932078", "step": 1382, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:50.988472", "step": 1382, "epoch": 2 }, { "type": "loss", "content": 0.022238006815314293, "timestamp": "2025-10-01 04:12:50.998805", "step": 1383, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:51.057338", "step": 1383, "epoch": 2 }, { "type": "loss", "content": 0.003020750591531396, "timestamp": "2025-10-01 04:12:51.089118", "step": 1384, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:51.163894", "step": 1384, "epoch": 2 }, { "type": "loss", "content": 0.02706991508603096, "timestamp": "2025-10-01 04:12:51.176846", "step": 1385, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:51.259214", "step": 1385, "epoch": 2 }, { "type": "loss", "content": 0.028605103492736816, "timestamp": "2025-10-01 04:12:51.262810", "step": 1386, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:51.315743", "step": 1386, "epoch": 2 }, { "type": "loss", "content": 0.025180544704198837, "timestamp": "2025-10-01 04:12:51.326277", "step": 1387, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:51.383324", "step": 1387, "epoch": 2 }, { "type": "loss", "content": 0.0067850155755877495, "timestamp": "2025-10-01 04:12:51.414825", "step": 1388, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:51.471586", "step": 1388, "epoch": 2 }, { "type": "loss", "content": 0.01375055406242609, "timestamp": "2025-10-01 04:12:51.481756", "step": 1389, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:51.548022", "step": 1389, "epoch": 2 }, { "type": "loss", "content": 0.014415652491152287, "timestamp": "2025-10-01 04:12:51.554362", "step": 1390, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:51.605115", "step": 1390, "epoch": 2 }, { "type": "loss", "content": 0.007323476020246744, "timestamp": "2025-10-01 04:12:51.608176", "step": 1391, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:51.663284", "step": 1391, "epoch": 2 }, { "type": "loss", "content": 0.010478307493031025, "timestamp": "2025-10-01 04:12:51.696534", "step": 1392, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:51.752897", "step": 1392, "epoch": 2 }, { "type": "loss", "content": 0.02164110541343689, "timestamp": "2025-10-01 04:12:51.766314", "step": 1393, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:51.822442", "step": 1393, "epoch": 2 }, { "type": "loss", "content": 0.023013019934296608, "timestamp": "2025-10-01 04:12:51.837219", "step": 1394, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:51.910087", "step": 1394, "epoch": 2 }, { "type": "loss", "content": 0.009080707095563412, "timestamp": "2025-10-01 04:12:51.914319", "step": 1395, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:51.980232", "step": 1395, "epoch": 2 }, { "type": "loss", "content": 0.01225399412214756, "timestamp": "2025-10-01 04:12:52.020515", "step": 1396, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:52.095286", "step": 1396, "epoch": 2 }, { "type": "loss", "content": 0.020442839711904526, "timestamp": "2025-10-01 04:12:52.113470", "step": 1397, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:52.186285", "step": 1397, "epoch": 2 }, { "type": "loss", "content": 0.047727856785058975, "timestamp": "2025-10-01 04:12:52.201680", "step": 1398, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:52.240690", "step": 1398, "epoch": 2 }, { "type": "loss", "content": 0.0108518460765481, "timestamp": "2025-10-01 04:12:52.254418", "step": 1399, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:52.308623", "step": 1399, "epoch": 2 }, { "type": "loss", "content": 0.011700905859470367, "timestamp": "2025-10-01 04:12:52.333993", "step": 1400, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:52.389322", "step": 1400, "epoch": 2 }, { "type": "loss", "content": 0.010864133015275002, "timestamp": "2025-10-01 04:12:52.393341", "step": 1401, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:52.457203", "step": 1401, "epoch": 2 }, { "type": "loss", "content": 0.01521242968738079, "timestamp": "2025-10-01 04:12:52.467015", "step": 1402, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:52.522928", "step": 1402, "epoch": 2 }, { "type": "loss", "content": 0.009760470129549503, "timestamp": "2025-10-01 04:12:52.534789", "step": 1403, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:52.591165", "step": 1403, "epoch": 2 }, { "type": "loss", "content": 0.0272379107773304, "timestamp": "2025-10-01 04:12:52.624671", "step": 1404, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:52.684040", "step": 1404, "epoch": 2 }, { "type": "loss", "content": 0.01665526069700718, "timestamp": "2025-10-01 04:12:52.694939", "step": 1405, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:52.748565", "step": 1405, "epoch": 2 }, { "type": "loss", "content": 0.02335176430642605, "timestamp": "2025-10-01 04:12:52.756002", "step": 1406, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:52.809191", "step": 1406, "epoch": 2 }, { "type": "loss", "content": 0.004674916621297598, "timestamp": "2025-10-01 04:12:52.813487", "step": 1407, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:52.851729", "step": 1407, "epoch": 2 }, { "type": "loss", "content": 0.013737602159380913, "timestamp": "2025-10-01 04:12:52.877217", "step": 1408, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:52.917518", "step": 1408, "epoch": 2 }, { "type": "loss", "content": 0.018468055874109268, "timestamp": "2025-10-01 04:12:52.924918", "step": 1409, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:52.972744", "step": 1409, "epoch": 2 }, { "type": "loss", "content": 0.004916821606457233, "timestamp": "2025-10-01 04:12:52.980930", "step": 1410, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:53.019702", "step": 1410, "epoch": 2 }, { "type": "loss", "content": 0.025401007384061813, "timestamp": "2025-10-01 04:12:53.024393", "step": 1411, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:53.059235", "step": 1411, "epoch": 2 }, { "type": "loss", "content": 0.00917412806302309, "timestamp": "2025-10-01 04:12:53.084790", "step": 1412, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:53.122164", "step": 1412, "epoch": 2 }, { "type": "loss", "content": 0.009031496942043304, "timestamp": "2025-10-01 04:12:53.131656", "step": 1413, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:53.193011", "step": 1413, "epoch": 2 }, { "type": "loss", "content": 0.002431566594168544, "timestamp": "2025-10-01 04:12:53.202473", "step": 1414, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:53.252552", "step": 1414, "epoch": 2 }, { "type": "loss", "content": 0.02220940962433815, "timestamp": "2025-10-01 04:12:53.261051", "step": 1415, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:53.311908", "step": 1415, "epoch": 2 }, { "type": "loss", "content": 0.008876879699528217, "timestamp": "2025-10-01 04:12:53.340842", "step": 1416, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:53.395352", "step": 1416, "epoch": 2 }, { "type": "loss", "content": 0.0041095237247645855, "timestamp": "2025-10-01 04:12:53.405356", "step": 1417, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:53.456345", "step": 1417, "epoch": 2 }, { "type": "loss", "content": 0.006583545822650194, "timestamp": "2025-10-01 04:12:53.464714", "step": 1418, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:53.517470", "step": 1418, "epoch": 2 }, { "type": "loss", "content": 0.009007715620100498, "timestamp": "2025-10-01 04:12:53.525617", "step": 1419, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:53.579787", "step": 1419, "epoch": 2 }, { "type": "loss", "content": 0.012834744527935982, "timestamp": "2025-10-01 04:12:53.612024", "step": 1420, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:53.676965", "step": 1420, "epoch": 2 }, { "type": "loss", "content": 0.017873764038085938, "timestamp": "2025-10-01 04:12:53.687305", "step": 1421, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:53.750360", "step": 1421, "epoch": 2 }, { "type": "loss", "content": 0.0027618408203125, "timestamp": "2025-10-01 04:12:53.753766", "step": 1422, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:53.809743", "step": 1422, "epoch": 2 }, { "type": "loss", "content": 0.04755884408950806, "timestamp": "2025-10-01 04:12:53.812911", "step": 1423, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:53.865498", "step": 1423, "epoch": 2 }, { "type": "loss", "content": 0.026457469910383224, "timestamp": "2025-10-01 04:12:53.895052", "step": 1424, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:53.946979", "step": 1424, "epoch": 2 }, { "type": "loss", "content": 0.02928990125656128, "timestamp": "2025-10-01 04:12:53.956292", "step": 1425, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:12:55.274016", "step": 1425, "epoch": 2 }, { "type": "pplx", "content": 81024098.02905558, "timestamp": "2025-10-01 04:12:55.283137", "step": 1425, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:55.323662", "step": 1425, "epoch": 2 }, { "type": "loss", "content": 0.0035102497786283493, "timestamp": "2025-10-01 04:12:55.331996", "step": 1426, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:55.383101", "step": 1426, "epoch": 2 }, { "type": "loss", "content": 0.007271726615726948, "timestamp": "2025-10-01 04:12:55.390564", "step": 1427, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:55.428679", "step": 1427, "epoch": 2 }, { "type": "loss", "content": 0.02840333618223667, "timestamp": "2025-10-01 04:12:55.459060", "step": 1428, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:55.503212", "step": 1428, "epoch": 2 }, { "type": "loss", "content": 0.004031289368867874, "timestamp": "2025-10-01 04:12:55.506514", "step": 1429, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:55.545395", "step": 1429, "epoch": 2 }, { "type": "loss", "content": 0.01716061308979988, "timestamp": "2025-10-01 04:12:55.553106", "step": 1430, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:55.602464", "step": 1430, "epoch": 2 }, { "type": "loss", "content": 0.008836604654788971, "timestamp": "2025-10-01 04:12:55.606287", "step": 1431, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:55.653722", "step": 1431, "epoch": 2 }, { "type": "loss", "content": 0.012098804116249084, "timestamp": "2025-10-01 04:12:55.683282", "step": 1432, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:55.732670", "step": 1432, "epoch": 2 }, { "type": "loss", "content": 0.024643605574965477, "timestamp": "2025-10-01 04:12:55.736455", "step": 1433, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:55.781764", "step": 1433, "epoch": 2 }, { "type": "loss", "content": 0.01903327740728855, "timestamp": "2025-10-01 04:12:55.790016", "step": 1434, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:55.826490", "step": 1434, "epoch": 2 }, { "type": "loss", "content": 0.016623888164758682, "timestamp": "2025-10-01 04:12:55.835021", "step": 1435, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:55.888600", "step": 1435, "epoch": 2 }, { "type": "loss", "content": 0.009309085085988045, "timestamp": "2025-10-01 04:12:55.919764", "step": 1436, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:55.978186", "step": 1436, "epoch": 2 }, { "type": "loss", "content": 0.01641160063445568, "timestamp": "2025-10-01 04:12:55.985389", "step": 1437, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:56.033625", "step": 1437, "epoch": 2 }, { "type": "loss", "content": 0.013727948069572449, "timestamp": "2025-10-01 04:12:56.042086", "step": 1438, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:56.092146", "step": 1438, "epoch": 2 }, { "type": "loss", "content": 0.01171187125146389, "timestamp": "2025-10-01 04:12:56.099344", "step": 1439, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:56.149026", "step": 1439, "epoch": 2 }, { "type": "loss", "content": 0.005448337644338608, "timestamp": "2025-10-01 04:12:56.177600", "step": 1440, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:56.226128", "step": 1440, "epoch": 2 }, { "type": "loss", "content": 0.005468662362545729, "timestamp": "2025-10-01 04:12:56.235762", "step": 1441, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:56.280850", "step": 1441, "epoch": 2 }, { "type": "loss", "content": 0.006323696114122868, "timestamp": "2025-10-01 04:12:56.289895", "step": 1442, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:56.360876", "step": 1442, "epoch": 2 }, { "type": "loss", "content": 0.011916747316718102, "timestamp": "2025-10-01 04:12:56.368685", "step": 1443, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:56.422136", "step": 1443, "epoch": 2 }, { "type": "loss", "content": 0.0029194927774369717, "timestamp": "2025-10-01 04:12:56.457307", "step": 1444, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:56.509042", "step": 1444, "epoch": 2 }, { "type": "loss", "content": 0.009464549832046032, "timestamp": "2025-10-01 04:12:56.511671", "step": 1445, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:56.558203", "step": 1445, "epoch": 2 }, { "type": "loss", "content": 0.014040338806807995, "timestamp": "2025-10-01 04:12:56.566269", "step": 1446, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:56.615819", "step": 1446, "epoch": 2 }, { "type": "loss", "content": 0.010641560889780521, "timestamp": "2025-10-01 04:12:56.623893", "step": 1447, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:56.674826", "step": 1447, "epoch": 2 }, { "type": "loss", "content": 0.015267870388925076, "timestamp": "2025-10-01 04:12:56.706388", "step": 1448, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:56.746499", "step": 1448, "epoch": 2 }, { "type": "loss", "content": 0.017825011163949966, "timestamp": "2025-10-01 04:12:56.749752", "step": 1449, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:56.793734", "step": 1449, "epoch": 2 }, { "type": "loss", "content": 0.010337586514651775, "timestamp": "2025-10-01 04:12:56.801961", "step": 1450, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:56.859522", "step": 1450, "epoch": 2 }, { "type": "loss", "content": 0.027803268283605576, "timestamp": "2025-10-01 04:12:56.866101", "step": 1451, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:56.917191", "step": 1451, "epoch": 2 }, { "type": "loss", "content": 0.010486974380910397, "timestamp": "2025-10-01 04:12:56.946933", "step": 1452, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:56.981331", "step": 1452, "epoch": 2 }, { "type": "loss", "content": 0.003227656939998269, "timestamp": "2025-10-01 04:12:56.988088", "step": 1453, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:57.032161", "step": 1453, "epoch": 2 }, { "type": "loss", "content": 0.01615157537162304, "timestamp": "2025-10-01 04:12:57.040206", "step": 1454, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:57.090577", "step": 1454, "epoch": 2 }, { "type": "loss", "content": 0.016013627871870995, "timestamp": "2025-10-01 04:12:57.098203", "step": 1455, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:57.155382", "step": 1455, "epoch": 2 }, { "type": "loss", "content": 0.023992856964468956, "timestamp": "2025-10-01 04:12:57.180844", "step": 1456, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:57.230465", "step": 1456, "epoch": 2 }, { "type": "loss", "content": 0.010106414556503296, "timestamp": "2025-10-01 04:12:57.240270", "step": 1457, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:57.281572", "step": 1457, "epoch": 2 }, { "type": "loss", "content": 0.02062460407614708, "timestamp": "2025-10-01 04:12:57.284766", "step": 1458, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:57.325578", "step": 1458, "epoch": 2 }, { "type": "loss", "content": 0.01863592490553856, "timestamp": "2025-10-01 04:12:57.330741", "step": 1459, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:57.372236", "step": 1459, "epoch": 2 }, { "type": "loss", "content": 0.004499680362641811, "timestamp": "2025-10-01 04:12:57.395954", "step": 1460, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:57.427187", "step": 1460, "epoch": 2 }, { "type": "loss", "content": 0.03403583914041519, "timestamp": "2025-10-01 04:12:57.429926", "step": 1461, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:57.460759", "step": 1461, "epoch": 2 }, { "type": "loss", "content": 0.014832577668130398, "timestamp": "2025-10-01 04:12:57.463370", "step": 1462, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:57.494456", "step": 1462, "epoch": 2 }, { "type": "loss", "content": 0.022800559177994728, "timestamp": "2025-10-01 04:12:57.496941", "step": 1463, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:57.527549", "step": 1463, "epoch": 2 }, { "type": "loss", "content": 0.022497637197375298, "timestamp": "2025-10-01 04:12:57.551562", "step": 1464, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:57.581736", "step": 1464, "epoch": 2 }, { "type": "loss", "content": 0.014008629135787487, "timestamp": "2025-10-01 04:12:57.583809", "step": 1465, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:57.614061", "step": 1465, "epoch": 2 }, { "type": "loss", "content": 0.027394462376832962, "timestamp": "2025-10-01 04:12:57.616241", "step": 1466, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:57.647161", "step": 1466, "epoch": 2 }, { "type": "loss", "content": 0.03310396149754524, "timestamp": "2025-10-01 04:12:57.649464", "step": 1467, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:57.680808", "step": 1467, "epoch": 2 }, { "type": "loss", "content": 0.01916583441197872, "timestamp": "2025-10-01 04:12:57.704447", "step": 1468, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:57.734862", "step": 1468, "epoch": 2 }, { "type": "loss", "content": 0.017947528511285782, "timestamp": "2025-10-01 04:12:57.737181", "step": 1469, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:57.793127", "step": 1469, "epoch": 2 }, { "type": "loss", "content": 0.026038730517029762, "timestamp": "2025-10-01 04:12:57.797050", "step": 1470, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:57.836630", "step": 1470, "epoch": 2 }, { "type": "loss", "content": 0.017815809696912766, "timestamp": "2025-10-01 04:12:57.842061", "step": 1471, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:57.893379", "step": 1471, "epoch": 2 }, { "type": "loss", "content": 0.014733510091900826, "timestamp": "2025-10-01 04:12:57.919086", "step": 1472, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:57.961717", "step": 1472, "epoch": 2 }, { "type": "loss", "content": 0.007148542441427708, "timestamp": "2025-10-01 04:12:57.964450", "step": 1473, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:58.004350", "step": 1473, "epoch": 2 }, { "type": "loss", "content": 0.010934674181044102, "timestamp": "2025-10-01 04:12:58.009758", "step": 1474, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:58.050044", "step": 1474, "epoch": 2 }, { "type": "loss", "content": 0.01579052396118641, "timestamp": "2025-10-01 04:12:58.055107", "step": 1475, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:58.097716", "step": 1475, "epoch": 2 }, { "type": "loss", "content": 0.013724747113883495, "timestamp": "2025-10-01 04:12:58.125086", "step": 1476, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:12:58.182801", "step": 1476, "epoch": 2 }, { "type": "loss", "content": 0.006694742478430271, "timestamp": "2025-10-01 04:12:58.191370", "step": 1477, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:58.250453", "step": 1477, "epoch": 2 }, { "type": "loss", "content": 0.010686053894460201, "timestamp": "2025-10-01 04:12:58.258702", "step": 1478, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:58.309810", "step": 1478, "epoch": 2 }, { "type": "loss", "content": 0.01659354753792286, "timestamp": "2025-10-01 04:12:58.316296", "step": 1479, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:58.359184", "step": 1479, "epoch": 2 }, { "type": "loss", "content": 0.013195483945310116, "timestamp": "2025-10-01 04:12:58.386773", "step": 1480, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:12:58.428827", "step": 1480, "epoch": 2 }, { "type": "loss", "content": 0.01653684861958027, "timestamp": "2025-10-01 04:12:58.436691", "step": 1481, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:12:58.481832", "step": 1481, "epoch": 2 }, { "type": "loss", "content": 0.01580851338803768, "timestamp": "2025-10-01 04:12:58.484372", "step": 1482, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:12:59.963200", "step": 1482, "epoch": 2 }, { "type": "pplx", "content": 84949291.70526439, "timestamp": "2025-10-01 04:12:59.974744", "step": 1482, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:00.018300", "step": 1482, "epoch": 2 }, { "type": "loss", "content": 0.017466850578784943, "timestamp": "2025-10-01 04:13:00.021522", "step": 1483, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:00.073224", "step": 1483, "epoch": 2 }, { "type": "loss", "content": 0.010928018018603325, "timestamp": "2025-10-01 04:13:00.104591", "step": 1484, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:13:00.159071", "step": 1484, "epoch": 2 }, { "type": "loss", "content": 0.026852060109376907, "timestamp": "2025-10-01 04:13:00.162297", "step": 1485, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:00.197149", "step": 1485, "epoch": 2 }, { "type": "loss", "content": 0.016156313940882683, "timestamp": "2025-10-01 04:13:00.204083", "step": 1486, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:00.257938", "step": 1486, "epoch": 2 }, { "type": "loss", "content": 0.004799918737262487, "timestamp": "2025-10-01 04:13:00.260677", "step": 1487, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:00.314212", "step": 1487, "epoch": 2 }, { "type": "loss", "content": 0.007315644528716803, "timestamp": "2025-10-01 04:13:00.344293", "step": 1488, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:00.392897", "step": 1488, "epoch": 2 }, { "type": "loss", "content": 0.012450148351490498, "timestamp": "2025-10-01 04:13:00.404146", "step": 1489, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:00.442020", "step": 1489, "epoch": 2 }, { "type": "loss", "content": 0.025318952277302742, "timestamp": "2025-10-01 04:13:00.444497", "step": 1490, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:00.497482", "step": 1490, "epoch": 2 }, { "type": "loss", "content": 0.014360993169248104, "timestamp": "2025-10-01 04:13:00.507317", "step": 1491, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:00.563849", "step": 1491, "epoch": 2 }, { "type": "loss", "content": 0.008939526043832302, "timestamp": "2025-10-01 04:13:00.594529", "step": 1492, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:00.640984", "step": 1492, "epoch": 2 }, { "type": "loss", "content": 0.004608576186001301, "timestamp": "2025-10-01 04:13:00.644839", "step": 1493, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:00.698159", "step": 1493, "epoch": 2 }, { "type": "loss", "content": 0.016055211424827576, "timestamp": "2025-10-01 04:13:00.701191", "step": 1494, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:13:00.744780", "step": 1494, "epoch": 2 }, { "type": "loss", "content": 0.0358593687415123, "timestamp": "2025-10-01 04:13:00.753016", "step": 1495, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:00.810832", "step": 1495, "epoch": 2 }, { "type": "loss", "content": 0.016153011471033096, "timestamp": "2025-10-01 04:13:00.836891", "step": 1496, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:00.878392", "step": 1496, "epoch": 2 }, { "type": "loss", "content": 0.009653310291469097, "timestamp": "2025-10-01 04:13:00.882264", "step": 1497, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:00.917780", "step": 1497, "epoch": 2 }, { "type": "loss", "content": 0.00823700986802578, "timestamp": "2025-10-01 04:13:00.920772", "step": 1498, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:00.969110", "step": 1498, "epoch": 2 }, { "type": "loss", "content": 0.020457221195101738, "timestamp": "2025-10-01 04:13:00.977680", "step": 1499, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:01.029140", "step": 1499, "epoch": 2 }, { "type": "loss", "content": 0.015353398397564888, "timestamp": "2025-10-01 04:13:01.061924", "step": 1500, "epoch": 2 }, { "type": "info", "content": "Checkpoint saved at step 1500", "timestamp": "2025-10-01 04:13:06.393499", "step": 1500, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:06.442676", "step": 1500, "epoch": 2 }, { "type": "loss", "content": 0.009914493188261986, "timestamp": "2025-10-01 04:13:06.449061", "step": 1501, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:06.497957", "step": 1501, "epoch": 2 }, { "type": "loss", "content": 0.006775428541004658, "timestamp": "2025-10-01 04:13:06.501114", "step": 1502, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:06.548913", "step": 1502, "epoch": 2 }, { "type": "loss", "content": 0.007616374175995588, "timestamp": "2025-10-01 04:13:06.554663", "step": 1503, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:06.603316", "step": 1503, "epoch": 2 }, { "type": "loss", "content": 0.010884781368076801, "timestamp": "2025-10-01 04:13:06.632490", "step": 1504, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:06.667253", "step": 1504, "epoch": 2 }, { "type": "loss", "content": 0.01737024076282978, "timestamp": "2025-10-01 04:13:06.676654", "step": 1505, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:06.730152", "step": 1505, "epoch": 2 }, { "type": "loss", "content": 0.029502153396606445, "timestamp": "2025-10-01 04:13:06.738106", "step": 1506, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:06.777838", "step": 1506, "epoch": 2 }, { "type": "loss", "content": 0.03163299709558487, "timestamp": "2025-10-01 04:13:06.786567", "step": 1507, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-10-01 04:13:06.835708", "step": 1507, "epoch": 2 }, { "type": "loss", "content": 0.012678087688982487, "timestamp": "2025-10-01 04:13:06.865222", "step": 1508, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:06.911776", "step": 1508, "epoch": 2 }, { "type": "loss", "content": 0.0038244540337473154, "timestamp": "2025-10-01 04:13:06.915334", "step": 1509, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:06.967546", "step": 1509, "epoch": 2 }, { "type": "loss", "content": 0.021646136417984962, "timestamp": "2025-10-01 04:13:06.975516", "step": 1510, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:07.021367", "step": 1510, "epoch": 2 }, { "type": "loss", "content": 0.0009849267080426216, "timestamp": "2025-10-01 04:13:07.030207", "step": 1511, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:07.079350", "step": 1511, "epoch": 2 }, { "type": "loss", "content": 0.009463687427341938, "timestamp": "2025-10-01 04:13:07.107820", "step": 1512, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:07.169104", "step": 1512, "epoch": 2 }, { "type": "loss", "content": 0.01159172784537077, "timestamp": "2025-10-01 04:13:07.178023", "step": 1513, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:07.225267", "step": 1513, "epoch": 2 }, { "type": "loss", "content": 0.02389632724225521, "timestamp": "2025-10-01 04:13:07.234903", "step": 1514, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:07.278785", "step": 1514, "epoch": 2 }, { "type": "loss", "content": 0.006701233331114054, "timestamp": "2025-10-01 04:13:07.285697", "step": 1515, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:07.338661", "step": 1515, "epoch": 2 }, { "type": "loss", "content": 0.06936667859554291, "timestamp": "2025-10-01 04:13:07.369101", "step": 1516, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:07.420893", "step": 1516, "epoch": 2 }, { "type": "loss", "content": 0.009653878398239613, "timestamp": "2025-10-01 04:13:07.426162", "step": 1517, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:07.475992", "step": 1517, "epoch": 2 }, { "type": "loss", "content": 0.026117833331227303, "timestamp": "2025-10-01 04:13:07.487396", "step": 1518, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:07.536917", "step": 1518, "epoch": 2 }, { "type": "loss", "content": 0.018482133746147156, "timestamp": "2025-10-01 04:13:07.541204", "step": 1519, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:07.603762", "step": 1519, "epoch": 2 }, { "type": "loss", "content": 0.002146028447896242, "timestamp": "2025-10-01 04:13:07.634959", "step": 1520, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:07.685281", "step": 1520, "epoch": 2 }, { "type": "loss", "content": 0.01685287430882454, "timestamp": "2025-10-01 04:13:07.689656", "step": 1521, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:07.738103", "step": 1521, "epoch": 2 }, { "type": "loss", "content": 0.006739395670592785, "timestamp": "2025-10-01 04:13:07.745658", "step": 1522, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:07.793751", "step": 1522, "epoch": 2 }, { "type": "loss", "content": 0.01353517360985279, "timestamp": "2025-10-01 04:13:07.801902", "step": 1523, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:07.854972", "step": 1523, "epoch": 2 }, { "type": "loss", "content": 0.012433887459337711, "timestamp": "2025-10-01 04:13:07.885214", "step": 1524, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:07.928123", "step": 1524, "epoch": 2 }, { "type": "loss", "content": 0.008951112627983093, "timestamp": "2025-10-01 04:13:07.931001", "step": 1525, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:07.978806", "step": 1525, "epoch": 2 }, { "type": "loss", "content": 0.011042612604796886, "timestamp": "2025-10-01 04:13:07.988286", "step": 1526, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:08.031557", "step": 1526, "epoch": 2 }, { "type": "loss", "content": 0.010584224946796894, "timestamp": "2025-10-01 04:13:08.035270", "step": 1527, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:08.087622", "step": 1527, "epoch": 2 }, { "type": "loss", "content": 0.03606724739074707, "timestamp": "2025-10-01 04:13:08.117541", "step": 1528, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:08.165808", "step": 1528, "epoch": 2 }, { "type": "loss", "content": 0.036213018000125885, "timestamp": "2025-10-01 04:13:08.172208", "step": 1529, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:08.220980", "step": 1529, "epoch": 2 }, { "type": "loss", "content": 0.0382445752620697, "timestamp": "2025-10-01 04:13:08.228964", "step": 1530, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:08.274295", "step": 1530, "epoch": 2 }, { "type": "loss", "content": 0.006602444685995579, "timestamp": "2025-10-01 04:13:08.283038", "step": 1531, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:08.331547", "step": 1531, "epoch": 2 }, { "type": "loss", "content": 0.020674001425504684, "timestamp": "2025-10-01 04:13:08.356355", "step": 1532, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:08.396906", "step": 1532, "epoch": 2 }, { "type": "loss", "content": 0.0238628126680851, "timestamp": "2025-10-01 04:13:08.399804", "step": 1533, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:08.433448", "step": 1533, "epoch": 2 }, { "type": "loss", "content": 0.04866022616624832, "timestamp": "2025-10-01 04:13:08.441887", "step": 1534, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:08.489206", "step": 1534, "epoch": 2 }, { "type": "loss", "content": 0.035724934190511703, "timestamp": "2025-10-01 04:13:08.503351", "step": 1535, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:08.571101", "step": 1535, "epoch": 2 }, { "type": "loss", "content": 0.026798827573657036, "timestamp": "2025-10-01 04:13:08.605571", "step": 1536, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:08.659294", "step": 1536, "epoch": 2 }, { "type": "loss", "content": 0.00810581911355257, "timestamp": "2025-10-01 04:13:08.668819", "step": 1537, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:08.724892", "step": 1537, "epoch": 2 }, { "type": "loss", "content": 0.03755718097090721, "timestamp": "2025-10-01 04:13:08.728316", "step": 1538, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:08.777270", "step": 1538, "epoch": 2 }, { "type": "loss", "content": 0.004284982569515705, "timestamp": "2025-10-01 04:13:08.781917", "step": 1539, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:13:10.471606", "step": 1539, "epoch": 2 }, { "type": "pplx", "content": 77655848.63616179, "timestamp": "2025-10-01 04:13:10.475558", "step": 1539, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:10.508860", "step": 1539, "epoch": 2 }, { "type": "loss", "content": 0.016568129882216454, "timestamp": "2025-10-01 04:13:10.533188", "step": 1540, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:10.576631", "step": 1540, "epoch": 2 }, { "type": "loss", "content": 0.038749318569898605, "timestamp": "2025-10-01 04:13:10.582652", "step": 1541, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:10.626905", "step": 1541, "epoch": 2 }, { "type": "loss", "content": 0.01315435953438282, "timestamp": "2025-10-01 04:13:10.632906", "step": 1542, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:10.684731", "step": 1542, "epoch": 2 }, { "type": "loss", "content": 0.0027639740146696568, "timestamp": "2025-10-01 04:13:10.693105", "step": 1543, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:10.731609", "step": 1543, "epoch": 2 }, { "type": "loss", "content": 0.003936560358852148, "timestamp": "2025-10-01 04:13:10.762627", "step": 1544, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:10.813420", "step": 1544, "epoch": 2 }, { "type": "loss", "content": 0.005857733078300953, "timestamp": "2025-10-01 04:13:10.821369", "step": 1545, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:10.872193", "step": 1545, "epoch": 2 }, { "type": "loss", "content": 0.04366546869277954, "timestamp": "2025-10-01 04:13:10.881688", "step": 1546, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:10.941190", "step": 1546, "epoch": 2 }, { "type": "loss", "content": 0.040080875158309937, "timestamp": "2025-10-01 04:13:10.952569", "step": 1547, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:10.999536", "step": 1547, "epoch": 2 }, { "type": "loss", "content": 0.01445924025028944, "timestamp": "2025-10-01 04:13:11.031193", "step": 1548, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:11.082481", "step": 1548, "epoch": 2 }, { "type": "loss", "content": 0.005054256413131952, "timestamp": "2025-10-01 04:13:11.086663", "step": 1549, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:11.131931", "step": 1549, "epoch": 2 }, { "type": "loss", "content": 0.0016825426137074828, "timestamp": "2025-10-01 04:13:11.142622", "step": 1550, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:11.193041", "step": 1550, "epoch": 2 }, { "type": "loss", "content": 0.01055573858320713, "timestamp": "2025-10-01 04:13:11.200268", "step": 1551, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:11.253270", "step": 1551, "epoch": 2 }, { "type": "loss", "content": 0.026796722784638405, "timestamp": "2025-10-01 04:13:11.280372", "step": 1552, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:11.322620", "step": 1552, "epoch": 2 }, { "type": "loss", "content": 0.005125401075929403, "timestamp": "2025-10-01 04:13:11.327414", "step": 1553, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:11.370658", "step": 1553, "epoch": 2 }, { "type": "loss", "content": 0.001129808253608644, "timestamp": "2025-10-01 04:13:11.376362", "step": 1554, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:11.427167", "step": 1554, "epoch": 2 }, { "type": "loss", "content": 0.05054508522152901, "timestamp": "2025-10-01 04:13:11.432103", "step": 1555, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:11.474371", "step": 1555, "epoch": 2 }, { "type": "loss", "content": 0.04554403945803642, "timestamp": "2025-10-01 04:13:11.501745", "step": 1556, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:11.540115", "step": 1556, "epoch": 2 }, { "type": "loss", "content": 0.0013658460229635239, "timestamp": "2025-10-01 04:13:11.544415", "step": 1557, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:11.586716", "step": 1557, "epoch": 2 }, { "type": "loss", "content": 0.004094754345715046, "timestamp": "2025-10-01 04:13:11.594196", "step": 1558, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:11.645190", "step": 1558, "epoch": 2 }, { "type": "loss", "content": 0.012671545147895813, "timestamp": "2025-10-01 04:13:11.647484", "step": 1559, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:11.685690", "step": 1559, "epoch": 2 }, { "type": "loss", "content": 0.0024722558446228504, "timestamp": "2025-10-01 04:13:11.715832", "step": 1560, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:11.765199", "step": 1560, "epoch": 2 }, { "type": "loss", "content": 0.040812183171510696, "timestamp": "2025-10-01 04:13:11.773090", "step": 1561, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:11.814801", "step": 1561, "epoch": 2 }, { "type": "loss", "content": 0.004918430000543594, "timestamp": "2025-10-01 04:13:11.822778", "step": 1562, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:11.873333", "step": 1562, "epoch": 2 }, { "type": "loss", "content": 0.0334562249481678, "timestamp": "2025-10-01 04:13:11.879939", "step": 1563, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:11.932231", "step": 1563, "epoch": 2 }, { "type": "loss", "content": 0.0065500554628670216, "timestamp": "2025-10-01 04:13:11.957013", "step": 1564, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:12.001221", "step": 1564, "epoch": 2 }, { "type": "loss", "content": 0.005265658255666494, "timestamp": "2025-10-01 04:13:12.008699", "step": 1565, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:12.046129", "step": 1565, "epoch": 2 }, { "type": "loss", "content": 0.005237952806055546, "timestamp": "2025-10-01 04:13:12.049451", "step": 1566, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:12.094927", "step": 1566, "epoch": 2 }, { "type": "loss", "content": 0.003421790199354291, "timestamp": "2025-10-01 04:13:12.103295", "step": 1567, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:12.144690", "step": 1567, "epoch": 2 }, { "type": "loss", "content": 0.02141939476132393, "timestamp": "2025-10-01 04:13:12.172141", "step": 1568, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:12.228523", "step": 1568, "epoch": 2 }, { "type": "loss", "content": 0.008104095235466957, "timestamp": "2025-10-01 04:13:12.235493", "step": 1569, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:12.280761", "step": 1569, "epoch": 2 }, { "type": "loss", "content": 0.01682603359222412, "timestamp": "2025-10-01 04:13:12.287455", "step": 1570, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:12.329120", "step": 1570, "epoch": 2 }, { "type": "loss", "content": 0.02583424746990204, "timestamp": "2025-10-01 04:13:12.333111", "step": 1571, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:12.382025", "step": 1571, "epoch": 2 }, { "type": "loss", "content": 0.011157028377056122, "timestamp": "2025-10-01 04:13:12.410105", "step": 1572, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:12.448660", "step": 1572, "epoch": 2 }, { "type": "loss", "content": 0.00611527357250452, "timestamp": "2025-10-01 04:13:12.453210", "step": 1573, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:12.497994", "step": 1573, "epoch": 2 }, { "type": "loss", "content": 0.0019314091186970472, "timestamp": "2025-10-01 04:13:12.503769", "step": 1574, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:12.550452", "step": 1574, "epoch": 2 }, { "type": "loss", "content": 0.021295342594385147, "timestamp": "2025-10-01 04:13:12.561638", "step": 1575, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:12.608409", "step": 1575, "epoch": 2 }, { "type": "loss", "content": 0.005156986881047487, "timestamp": "2025-10-01 04:13:12.638922", "step": 1576, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:12.681337", "step": 1576, "epoch": 2 }, { "type": "loss", "content": 0.011544455774128437, "timestamp": "2025-10-01 04:13:12.688360", "step": 1577, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:12.719960", "step": 1577, "epoch": 2 }, { "type": "loss", "content": 0.016041046008467674, "timestamp": "2025-10-01 04:13:12.722928", "step": 1578, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:12.767531", "step": 1578, "epoch": 2 }, { "type": "loss", "content": 0.004609879106283188, "timestamp": "2025-10-01 04:13:12.771288", "step": 1579, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:12.810722", "step": 1579, "epoch": 2 }, { "type": "loss", "content": 0.01365666277706623, "timestamp": "2025-10-01 04:13:12.838008", "step": 1580, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:12.877061", "step": 1580, "epoch": 2 }, { "type": "loss", "content": 0.01647232100367546, "timestamp": "2025-10-01 04:13:12.880290", "step": 1581, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:12.927247", "step": 1581, "epoch": 2 }, { "type": "loss", "content": 0.007607575505971909, "timestamp": "2025-10-01 04:13:12.933615", "step": 1582, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:12.979921", "step": 1582, "epoch": 2 }, { "type": "loss", "content": 0.006439111661165953, "timestamp": "2025-10-01 04:13:12.986838", "step": 1583, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:13.026906", "step": 1583, "epoch": 2 }, { "type": "loss", "content": 0.011291056871414185, "timestamp": "2025-10-01 04:13:13.051543", "step": 1584, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:13.093150", "step": 1584, "epoch": 2 }, { "type": "loss", "content": 0.010510903783142567, "timestamp": "2025-10-01 04:13:13.099338", "step": 1585, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:13.143499", "step": 1585, "epoch": 2 }, { "type": "loss", "content": 0.028885314241051674, "timestamp": "2025-10-01 04:13:13.147899", "step": 1586, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:13.188834", "step": 1586, "epoch": 2 }, { "type": "loss", "content": 0.03779543563723564, "timestamp": "2025-10-01 04:13:13.195054", "step": 1587, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:13.242634", "step": 1587, "epoch": 2 }, { "type": "loss", "content": 0.006501057185232639, "timestamp": "2025-10-01 04:13:13.276221", "step": 1588, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:13.313655", "step": 1588, "epoch": 2 }, { "type": "loss", "content": 0.017126483842730522, "timestamp": "2025-10-01 04:13:13.320308", "step": 1589, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:13.361042", "step": 1589, "epoch": 2 }, { "type": "loss", "content": 0.01926274411380291, "timestamp": "2025-10-01 04:13:13.367872", "step": 1590, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:13.406104", "step": 1590, "epoch": 2 }, { "type": "loss", "content": 0.026384757831692696, "timestamp": "2025-10-01 04:13:13.412951", "step": 1591, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:13.457903", "step": 1591, "epoch": 2 }, { "type": "loss", "content": 0.037959903478622437, "timestamp": "2025-10-01 04:13:13.482434", "step": 1592, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:13.520971", "step": 1592, "epoch": 2 }, { "type": "loss", "content": 0.008754936046898365, "timestamp": "2025-10-01 04:13:13.526987", "step": 1593, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:13.575983", "step": 1593, "epoch": 2 }, { "type": "loss", "content": 0.01358555257320404, "timestamp": "2025-10-01 04:13:13.582625", "step": 1594, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:13.628764", "step": 1594, "epoch": 2 }, { "type": "loss", "content": 0.011499091051518917, "timestamp": "2025-10-01 04:13:13.634773", "step": 1595, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:13.681097", "step": 1595, "epoch": 2 }, { "type": "loss", "content": 0.014255395159125328, "timestamp": "2025-10-01 04:13:13.704974", "step": 1596, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:13:14.463635", "step": 1596, "epoch": 2 }, { "type": "pplx", "content": 52347787.02300448, "timestamp": "2025-10-01 04:13:14.467134", "step": 1596, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:14.497190", "step": 1596, "epoch": 2 }, { "type": "loss", "content": 0.008003183640539646, "timestamp": "2025-10-01 04:13:14.499343", "step": 1597, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:14.530125", "step": 1597, "epoch": 2 }, { "type": "loss", "content": 0.009980769827961922, "timestamp": "2025-10-01 04:13:14.532732", "step": 1598, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:14.563640", "step": 1598, "epoch": 2 }, { "type": "loss", "content": 0.017090851441025734, "timestamp": "2025-10-01 04:13:14.565998", "step": 1599, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:14.597935", "step": 1599, "epoch": 2 }, { "type": "loss", "content": 0.011963611468672752, "timestamp": "2025-10-01 04:13:14.621268", "step": 1600, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:14.654546", "step": 1600, "epoch": 2 }, { "type": "loss", "content": 0.006779204122722149, "timestamp": "2025-10-01 04:13:14.656764", "step": 1601, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:14.690170", "step": 1601, "epoch": 2 }, { "type": "loss", "content": 0.025302747264504433, "timestamp": "2025-10-01 04:13:14.692544", "step": 1602, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:14.726423", "step": 1602, "epoch": 2 }, { "type": "loss", "content": 0.017179856076836586, "timestamp": "2025-10-01 04:13:14.728666", "step": 1603, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:14.762226", "step": 1603, "epoch": 2 }, { "type": "loss", "content": 0.005239727441221476, "timestamp": "2025-10-01 04:13:14.786399", "step": 1604, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:14.818482", "step": 1604, "epoch": 2 }, { "type": "loss", "content": 0.00681100320070982, "timestamp": "2025-10-01 04:13:14.820686", "step": 1605, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:14.851705", "step": 1605, "epoch": 2 }, { "type": "loss", "content": 0.031561676412820816, "timestamp": "2025-10-01 04:13:14.854625", "step": 1606, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:14.885921", "step": 1606, "epoch": 2 }, { "type": "loss", "content": 0.008977233432233334, "timestamp": "2025-10-01 04:13:14.888366", "step": 1607, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:14.919498", "step": 1607, "epoch": 2 }, { "type": "loss", "content": 0.010205530561506748, "timestamp": "2025-10-01 04:13:14.943001", "step": 1608, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:14.973817", "step": 1608, "epoch": 2 }, { "type": "loss", "content": 0.007750756572932005, "timestamp": "2025-10-01 04:13:14.976302", "step": 1609, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:15.008690", "step": 1609, "epoch": 2 }, { "type": "loss", "content": 0.027499424293637276, "timestamp": "2025-10-01 04:13:15.010962", "step": 1610, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:15.041843", "step": 1610, "epoch": 2 }, { "type": "loss", "content": 0.04024529829621315, "timestamp": "2025-10-01 04:13:15.044051", "step": 1611, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:15.074969", "step": 1611, "epoch": 2 }, { "type": "loss", "content": 0.006539492402225733, "timestamp": "2025-10-01 04:13:15.098707", "step": 1612, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:15.131453", "step": 1612, "epoch": 2 }, { "type": "loss", "content": 0.01586497202515602, "timestamp": "2025-10-01 04:13:15.134249", "step": 1613, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:15.166162", "step": 1613, "epoch": 2 }, { "type": "loss", "content": 0.007332774344831705, "timestamp": "2025-10-01 04:13:15.168579", "step": 1614, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:15.200232", "step": 1614, "epoch": 2 }, { "type": "loss", "content": 0.0067348359152674675, "timestamp": "2025-10-01 04:13:15.202679", "step": 1615, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:15.233556", "step": 1615, "epoch": 2 }, { "type": "loss", "content": 0.020247790962457657, "timestamp": "2025-10-01 04:13:15.257629", "step": 1616, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:15.288443", "step": 1616, "epoch": 2 }, { "type": "loss", "content": 0.020015602931380272, "timestamp": "2025-10-01 04:13:15.290865", "step": 1617, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:15.321155", "step": 1617, "epoch": 2 }, { "type": "loss", "content": 0.007512770593166351, "timestamp": "2025-10-01 04:13:15.323394", "step": 1618, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:15.353553", "step": 1618, "epoch": 2 }, { "type": "loss", "content": 0.013618254102766514, "timestamp": "2025-10-01 04:13:15.355771", "step": 1619, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:15.386919", "step": 1619, "epoch": 2 }, { "type": "loss", "content": 0.013072640635073185, "timestamp": "2025-10-01 04:13:15.410631", "step": 1620, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:15.441777", "step": 1620, "epoch": 2 }, { "type": "loss", "content": 0.00882615428417921, "timestamp": "2025-10-01 04:13:15.445282", "step": 1621, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:15.476808", "step": 1621, "epoch": 2 }, { "type": "loss", "content": 0.010403069667518139, "timestamp": "2025-10-01 04:13:15.479390", "step": 1622, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:15.513491", "step": 1622, "epoch": 2 }, { "type": "loss", "content": 0.0030257641337811947, "timestamp": "2025-10-01 04:13:15.515779", "step": 1623, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:15.546806", "step": 1623, "epoch": 2 }, { "type": "loss", "content": 0.004086515866219997, "timestamp": "2025-10-01 04:13:15.570626", "step": 1624, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:15.600868", "step": 1624, "epoch": 2 }, { "type": "loss", "content": 0.010360230691730976, "timestamp": "2025-10-01 04:13:15.603087", "step": 1625, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:15.633799", "step": 1625, "epoch": 2 }, { "type": "loss", "content": 0.006260990630835295, "timestamp": "2025-10-01 04:13:15.635965", "step": 1626, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:15.666052", "step": 1626, "epoch": 2 }, { "type": "loss", "content": 0.047125279903411865, "timestamp": "2025-10-01 04:13:15.668362", "step": 1627, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:13:15.700288", "step": 1627, "epoch": 2 }, { "type": "loss", "content": 0.008346055634319782, "timestamp": "2025-10-01 04:13:15.724522", "step": 1628, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:15.757788", "step": 1628, "epoch": 2 }, { "type": "loss", "content": 0.017164109274744987, "timestamp": "2025-10-01 04:13:15.761373", "step": 1629, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:15.794440", "step": 1629, "epoch": 2 }, { "type": "loss", "content": 0.008173419162631035, "timestamp": "2025-10-01 04:13:15.797100", "step": 1630, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:15.827211", "step": 1630, "epoch": 2 }, { "type": "loss", "content": 0.008467881008982658, "timestamp": "2025-10-01 04:13:15.829571", "step": 1631, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:15.859988", "step": 1631, "epoch": 2 }, { "type": "loss", "content": 0.03314581885933876, "timestamp": "2025-10-01 04:13:15.883874", "step": 1632, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:15.914163", "step": 1632, "epoch": 2 }, { "type": "loss", "content": 0.022746745496988297, "timestamp": "2025-10-01 04:13:15.916275", "step": 1633, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:15.946825", "step": 1633, "epoch": 2 }, { "type": "loss", "content": 0.0051367864944040775, "timestamp": "2025-10-01 04:13:15.948975", "step": 1634, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:15.979537", "step": 1634, "epoch": 2 }, { "type": "loss", "content": 0.009908909909427166, "timestamp": "2025-10-01 04:13:15.981783", "step": 1635, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:16.013665", "step": 1635, "epoch": 2 }, { "type": "loss", "content": 0.006155069451779127, "timestamp": "2025-10-01 04:13:16.037572", "step": 1636, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:16.068168", "step": 1636, "epoch": 2 }, { "type": "loss", "content": 0.0286524947732687, "timestamp": "2025-10-01 04:13:16.070394", "step": 1637, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:16.101977", "step": 1637, "epoch": 2 }, { "type": "loss", "content": 0.0015293165342882276, "timestamp": "2025-10-01 04:13:16.119786", "step": 1638, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:16.152532", "step": 1638, "epoch": 2 }, { "type": "loss", "content": 0.0018288606079295278, "timestamp": "2025-10-01 04:13:16.160485", "step": 1639, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:16.196914", "step": 1639, "epoch": 2 }, { "type": "loss", "content": 0.0069273049011826515, "timestamp": "2025-10-01 04:13:16.232303", "step": 1640, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:16.263740", "step": 1640, "epoch": 2 }, { "type": "loss", "content": 0.02906244620680809, "timestamp": "2025-10-01 04:13:16.266019", "step": 1641, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:16.300037", "step": 1641, "epoch": 2 }, { "type": "loss", "content": 0.03223400190472603, "timestamp": "2025-10-01 04:13:16.317374", "step": 1642, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:16.364037", "step": 1642, "epoch": 2 }, { "type": "loss", "content": 0.01042555458843708, "timestamp": "2025-10-01 04:13:16.366762", "step": 1643, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:16.413526", "step": 1643, "epoch": 2 }, { "type": "loss", "content": 0.011268666945397854, "timestamp": "2025-10-01 04:13:16.438019", "step": 1644, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:16.476833", "step": 1644, "epoch": 2 }, { "type": "loss", "content": 0.004011114593595266, "timestamp": "2025-10-01 04:13:16.480232", "step": 1645, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:16.524681", "step": 1645, "epoch": 2 }, { "type": "loss", "content": 0.007196138612926006, "timestamp": "2025-10-01 04:13:16.533262", "step": 1646, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:16.570305", "step": 1646, "epoch": 2 }, { "type": "loss", "content": 0.014080160297453403, "timestamp": "2025-10-01 04:13:16.573389", "step": 1647, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:16.609290", "step": 1647, "epoch": 2 }, { "type": "loss", "content": 0.0066165500320494175, "timestamp": "2025-10-01 04:13:16.637774", "step": 1648, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:16.670959", "step": 1648, "epoch": 2 }, { "type": "loss", "content": 0.023716462776064873, "timestamp": "2025-10-01 04:13:16.675706", "step": 1649, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:16.714798", "step": 1649, "epoch": 2 }, { "type": "loss", "content": 0.00808466225862503, "timestamp": "2025-10-01 04:13:16.717082", "step": 1650, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:16.755241", "step": 1650, "epoch": 2 }, { "type": "loss", "content": 0.0066062286496162415, "timestamp": "2025-10-01 04:13:16.757816", "step": 1651, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:16.815548", "step": 1651, "epoch": 2 }, { "type": "loss", "content": 0.005531540606170893, "timestamp": "2025-10-01 04:13:16.842589", "step": 1652, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:16.886964", "step": 1652, "epoch": 2 }, { "type": "loss", "content": 0.012811918742954731, "timestamp": "2025-10-01 04:13:16.889840", "step": 1653, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:13:17.749843", "step": 1653, "epoch": 2 }, { "type": "pplx", "content": 56661103.4377224, "timestamp": "2025-10-01 04:13:17.752192", "step": 1653, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:17.782199", "step": 1653, "epoch": 2 }, { "type": "loss", "content": 0.018863413482904434, "timestamp": "2025-10-01 04:13:17.784596", "step": 1654, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:17.818028", "step": 1654, "epoch": 2 }, { "type": "loss", "content": 0.007815408520400524, "timestamp": "2025-10-01 04:13:17.820306", "step": 1655, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:17.850419", "step": 1655, "epoch": 2 }, { "type": "loss", "content": 0.0063620880246162415, "timestamp": "2025-10-01 04:13:17.874417", "step": 1656, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:17.905300", "step": 1656, "epoch": 2 }, { "type": "loss", "content": 0.020800398662686348, "timestamp": "2025-10-01 04:13:17.907589", "step": 1657, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:17.938776", "step": 1657, "epoch": 2 }, { "type": "loss", "content": 0.02517288736999035, "timestamp": "2025-10-01 04:13:17.940894", "step": 1658, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:17.971646", "step": 1658, "epoch": 2 }, { "type": "loss", "content": 0.010495798662304878, "timestamp": "2025-10-01 04:13:17.974010", "step": 1659, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:18.004415", "step": 1659, "epoch": 2 }, { "type": "loss", "content": 0.0034333032090216875, "timestamp": "2025-10-01 04:13:18.028138", "step": 1660, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:18.058615", "step": 1660, "epoch": 2 }, { "type": "loss", "content": 0.04161619767546654, "timestamp": "2025-10-01 04:13:18.061050", "step": 1661, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:18.092475", "step": 1661, "epoch": 2 }, { "type": "loss", "content": 0.01029189396649599, "timestamp": "2025-10-01 04:13:18.094882", "step": 1662, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:18.124991", "step": 1662, "epoch": 2 }, { "type": "loss", "content": 0.004730475600808859, "timestamp": "2025-10-01 04:13:18.127165", "step": 1663, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:18.158450", "step": 1663, "epoch": 2 }, { "type": "loss", "content": 0.005704338662326336, "timestamp": "2025-10-01 04:13:18.182232", "step": 1664, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:18.216043", "step": 1664, "epoch": 2 }, { "type": "loss", "content": 0.013141750358045101, "timestamp": "2025-10-01 04:13:18.218090", "step": 1665, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:18.249036", "step": 1665, "epoch": 2 }, { "type": "loss", "content": 0.015302762389183044, "timestamp": "2025-10-01 04:13:18.251200", "step": 1666, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:18.282261", "step": 1666, "epoch": 2 }, { "type": "loss", "content": 0.016241682693362236, "timestamp": "2025-10-01 04:13:18.284567", "step": 1667, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:18.316231", "step": 1667, "epoch": 2 }, { "type": "loss", "content": 0.0158796738833189, "timestamp": "2025-10-01 04:13:18.341517", "step": 1668, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:18.372234", "step": 1668, "epoch": 2 }, { "type": "loss", "content": 0.03702926263213158, "timestamp": "2025-10-01 04:13:18.374471", "step": 1669, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:18.404827", "step": 1669, "epoch": 2 }, { "type": "loss", "content": 0.011859017424285412, "timestamp": "2025-10-01 04:13:18.406941", "step": 1670, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:18.440403", "step": 1670, "epoch": 2 }, { "type": "loss", "content": 0.012553932145237923, "timestamp": "2025-10-01 04:13:18.442586", "step": 1671, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:18.472924", "step": 1671, "epoch": 2 }, { "type": "loss", "content": 0.01608966663479805, "timestamp": "2025-10-01 04:13:18.496912", "step": 1672, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:18.531887", "step": 1672, "epoch": 2 }, { "type": "loss", "content": 0.010410645976662636, "timestamp": "2025-10-01 04:13:18.534089", "step": 1673, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:18.565628", "step": 1673, "epoch": 2 }, { "type": "loss", "content": 0.003722522873431444, "timestamp": "2025-10-01 04:13:18.568039", "step": 1674, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:18.599519", "step": 1674, "epoch": 2 }, { "type": "loss", "content": 0.0032592397183179855, "timestamp": "2025-10-01 04:13:18.601890", "step": 1675, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:18.634145", "step": 1675, "epoch": 2 }, { "type": "loss", "content": 0.002561945701017976, "timestamp": "2025-10-01 04:13:18.659400", "step": 1676, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:18.694416", "step": 1676, "epoch": 2 }, { "type": "loss", "content": 0.008923517540097237, "timestamp": "2025-10-01 04:13:18.696491", "step": 1677, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:18.729157", "step": 1677, "epoch": 2 }, { "type": "loss", "content": 0.005183252971619368, "timestamp": "2025-10-01 04:13:18.732975", "step": 1678, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:13:18.770986", "step": 1678, "epoch": 2 }, { "type": "loss", "content": 0.019529495388269424, "timestamp": "2025-10-01 04:13:18.774215", "step": 1679, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:18.812820", "step": 1679, "epoch": 2 }, { "type": "loss", "content": 0.018010234460234642, "timestamp": "2025-10-01 04:13:18.838502", "step": 1680, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:18.871249", "step": 1680, "epoch": 2 }, { "type": "loss", "content": 0.004157633520662785, "timestamp": "2025-10-01 04:13:18.873334", "step": 1681, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:18.905518", "step": 1681, "epoch": 2 }, { "type": "loss", "content": 0.008690069429576397, "timestamp": "2025-10-01 04:13:18.908634", "step": 1682, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:18.939282", "step": 1682, "epoch": 2 }, { "type": "loss", "content": 0.005145031027495861, "timestamp": "2025-10-01 04:13:18.941810", "step": 1683, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:18.973616", "step": 1683, "epoch": 2 }, { "type": "loss", "content": 0.0030300146900117397, "timestamp": "2025-10-01 04:13:18.997599", "step": 1684, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:19.029414", "step": 1684, "epoch": 2 }, { "type": "loss", "content": 0.025381917133927345, "timestamp": "2025-10-01 04:13:19.031698", "step": 1685, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:19.062812", "step": 1685, "epoch": 2 }, { "type": "loss", "content": 0.005756060127168894, "timestamp": "2025-10-01 04:13:19.064963", "step": 1686, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:19.096585", "step": 1686, "epoch": 2 }, { "type": "loss", "content": 0.03418432176113129, "timestamp": "2025-10-01 04:13:19.099015", "step": 1687, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:19.129810", "step": 1687, "epoch": 2 }, { "type": "loss", "content": 0.04978889226913452, "timestamp": "2025-10-01 04:13:19.153300", "step": 1688, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:19.185013", "step": 1688, "epoch": 2 }, { "type": "loss", "content": 0.001084079034626484, "timestamp": "2025-10-01 04:13:19.187406", "step": 1689, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:19.218692", "step": 1689, "epoch": 2 }, { "type": "loss", "content": 0.05039183050394058, "timestamp": "2025-10-01 04:13:19.220944", "step": 1690, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:19.252295", "step": 1690, "epoch": 2 }, { "type": "loss", "content": 0.00032829755218699574, "timestamp": "2025-10-01 04:13:19.254419", "step": 1691, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:19.285597", "step": 1691, "epoch": 2 }, { "type": "loss", "content": 0.027190400287508965, "timestamp": "2025-10-01 04:13:19.310401", "step": 1692, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:19.341228", "step": 1692, "epoch": 2 }, { "type": "loss", "content": 0.013346688821911812, "timestamp": "2025-10-01 04:13:19.343899", "step": 1693, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:19.375400", "step": 1693, "epoch": 2 }, { "type": "loss", "content": 0.0057144020684063435, "timestamp": "2025-10-01 04:13:19.378385", "step": 1694, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:19.409704", "step": 1694, "epoch": 2 }, { "type": "loss", "content": 0.0006181415519677103, "timestamp": "2025-10-01 04:13:19.412020", "step": 1695, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:19.443084", "step": 1695, "epoch": 2 }, { "type": "loss", "content": 0.05258602276444435, "timestamp": "2025-10-01 04:13:19.467230", "step": 1696, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:19.498307", "step": 1696, "epoch": 2 }, { "type": "loss", "content": 0.002812599530443549, "timestamp": "2025-10-01 04:13:19.500894", "step": 1697, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:19.533068", "step": 1697, "epoch": 2 }, { "type": "loss", "content": 0.012100943364202976, "timestamp": "2025-10-01 04:13:19.536128", "step": 1698, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:19.567213", "step": 1698, "epoch": 2 }, { "type": "loss", "content": 0.0027744087856262922, "timestamp": "2025-10-01 04:13:19.569264", "step": 1699, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:19.599965", "step": 1699, "epoch": 2 }, { "type": "loss", "content": 0.012924875132739544, "timestamp": "2025-10-01 04:13:19.623715", "step": 1700, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:19.654117", "step": 1700, "epoch": 2 }, { "type": "loss", "content": 0.013080582022666931, "timestamp": "2025-10-01 04:13:19.656267", "step": 1701, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:19.686977", "step": 1701, "epoch": 2 }, { "type": "loss", "content": 0.0027978557627648115, "timestamp": "2025-10-01 04:13:19.689059", "step": 1702, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:19.720209", "step": 1702, "epoch": 2 }, { "type": "loss", "content": 0.0007439813925884664, "timestamp": "2025-10-01 04:13:19.722717", "step": 1703, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:19.753258", "step": 1703, "epoch": 2 }, { "type": "loss", "content": 0.000717491318937391, "timestamp": "2025-10-01 04:13:19.777331", "step": 1704, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:19.807758", "step": 1704, "epoch": 2 }, { "type": "loss", "content": 0.007695492822676897, "timestamp": "2025-10-01 04:13:19.809982", "step": 1705, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:19.840333", "step": 1705, "epoch": 2 }, { "type": "loss", "content": 0.01251158770173788, "timestamp": "2025-10-01 04:13:19.842649", "step": 1706, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:19.873050", "step": 1706, "epoch": 2 }, { "type": "loss", "content": 0.0007180058746598661, "timestamp": "2025-10-01 04:13:19.875098", "step": 1707, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:19.909658", "step": 1707, "epoch": 2 }, { "type": "loss", "content": 0.018272066488862038, "timestamp": "2025-10-01 04:13:19.934124", "step": 1708, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:19.965005", "step": 1708, "epoch": 2 }, { "type": "loss", "content": 0.0062143513932824135, "timestamp": "2025-10-01 04:13:19.967004", "step": 1709, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:19.997291", "step": 1709, "epoch": 2 }, { "type": "loss", "content": 0.003083524527028203, "timestamp": "2025-10-01 04:13:19.999376", "step": 1710, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:13:20.738670", "step": 1710, "epoch": 2 }, { "type": "pplx", "content": 68561080.48393603, "timestamp": "2025-10-01 04:13:20.740900", "step": 1710, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:20.770926", "step": 1710, "epoch": 2 }, { "type": "loss", "content": 0.023226430639624596, "timestamp": "2025-10-01 04:13:20.773223", "step": 1711, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:20.804471", "step": 1711, "epoch": 2 }, { "type": "loss", "content": 0.019120950251817703, "timestamp": "2025-10-01 04:13:20.828991", "step": 1712, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:20.867736", "step": 1712, "epoch": 2 }, { "type": "loss", "content": 0.005029122345149517, "timestamp": "2025-10-01 04:13:20.869929", "step": 1713, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:20.900792", "step": 1713, "epoch": 2 }, { "type": "loss", "content": 0.0059706405736505985, "timestamp": "2025-10-01 04:13:20.903081", "step": 1714, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:20.934588", "step": 1714, "epoch": 2 }, { "type": "loss", "content": 0.003695933148264885, "timestamp": "2025-10-01 04:13:20.936922", "step": 1715, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:20.967642", "step": 1715, "epoch": 2 }, { "type": "loss", "content": 0.009229108691215515, "timestamp": "2025-10-01 04:13:20.991312", "step": 1716, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:21.023899", "step": 1716, "epoch": 2 }, { "type": "loss", "content": 0.032100412994623184, "timestamp": "2025-10-01 04:13:21.026151", "step": 1717, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:21.059084", "step": 1717, "epoch": 2 }, { "type": "loss", "content": 0.012853546068072319, "timestamp": "2025-10-01 04:13:21.061434", "step": 1718, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:21.091500", "step": 1718, "epoch": 2 }, { "type": "loss", "content": 0.01731124520301819, "timestamp": "2025-10-01 04:13:21.094051", "step": 1719, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:21.124812", "step": 1719, "epoch": 2 }, { "type": "loss", "content": 0.023197930306196213, "timestamp": "2025-10-01 04:13:21.148388", "step": 1720, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:21.180463", "step": 1720, "epoch": 2 }, { "type": "loss", "content": 0.005342247895896435, "timestamp": "2025-10-01 04:13:21.182601", "step": 1721, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:21.213459", "step": 1721, "epoch": 2 }, { "type": "loss", "content": 0.00981935765594244, "timestamp": "2025-10-01 04:13:21.215613", "step": 1722, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:21.246660", "step": 1722, "epoch": 2 }, { "type": "loss", "content": 0.00958984438329935, "timestamp": "2025-10-01 04:13:21.249205", "step": 1723, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:21.280733", "step": 1723, "epoch": 2 }, { "type": "loss", "content": 0.006270966958254576, "timestamp": "2025-10-01 04:13:21.304568", "step": 1724, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:21.336570", "step": 1724, "epoch": 2 }, { "type": "loss", "content": 0.0034877455327659845, "timestamp": "2025-10-01 04:13:21.338909", "step": 1725, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:21.369321", "step": 1725, "epoch": 2 }, { "type": "loss", "content": 0.010010543279349804, "timestamp": "2025-10-01 04:13:21.371830", "step": 1726, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:21.402628", "step": 1726, "epoch": 2 }, { "type": "loss", "content": 0.01026623509824276, "timestamp": "2025-10-01 04:13:21.405007", "step": 1727, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:21.435258", "step": 1727, "epoch": 2 }, { "type": "loss", "content": 0.0038282847963273525, "timestamp": "2025-10-01 04:13:21.459183", "step": 1728, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:21.489333", "step": 1728, "epoch": 2 }, { "type": "loss", "content": 0.007716696243733168, "timestamp": "2025-10-01 04:13:21.491842", "step": 1729, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:21.521786", "step": 1729, "epoch": 2 }, { "type": "loss", "content": 0.013947995379567146, "timestamp": "2025-10-01 04:13:21.523495", "step": 1730, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:21.554055", "step": 1730, "epoch": 2 }, { "type": "loss", "content": 0.008495950140058994, "timestamp": "2025-10-01 04:13:21.556053", "step": 1731, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:21.588730", "step": 1731, "epoch": 2 }, { "type": "loss", "content": 0.0012179531622678041, "timestamp": "2025-10-01 04:13:21.612653", "step": 1732, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:21.644163", "step": 1732, "epoch": 2 }, { "type": "loss", "content": 0.004571664612740278, "timestamp": "2025-10-01 04:13:21.646329", "step": 1733, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:21.677325", "step": 1733, "epoch": 2 }, { "type": "loss", "content": 0.010332281701266766, "timestamp": "2025-10-01 04:13:21.680016", "step": 1734, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:21.711530", "step": 1734, "epoch": 2 }, { "type": "loss", "content": 0.004366362001746893, "timestamp": "2025-10-01 04:13:21.713653", "step": 1735, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:21.750472", "step": 1735, "epoch": 2 }, { "type": "loss", "content": 0.006593588273972273, "timestamp": "2025-10-01 04:13:21.774349", "step": 1736, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:21.805952", "step": 1736, "epoch": 2 }, { "type": "loss", "content": 0.011689740233123302, "timestamp": "2025-10-01 04:13:21.810334", "step": 1737, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:21.844644", "step": 1737, "epoch": 2 }, { "type": "loss", "content": 0.01590937003493309, "timestamp": "2025-10-01 04:13:21.847777", "step": 1738, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:21.890165", "step": 1738, "epoch": 2 }, { "type": "loss", "content": 0.0026656892150640488, "timestamp": "2025-10-01 04:13:21.895407", "step": 1739, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:21.931053", "step": 1739, "epoch": 2 }, { "type": "loss", "content": 0.0040006088092923164, "timestamp": "2025-10-01 04:13:21.954962", "step": 1740, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:21.990483", "step": 1740, "epoch": 2 }, { "type": "loss", "content": 0.02416076697409153, "timestamp": "2025-10-01 04:13:21.992541", "step": 1741, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:22.024682", "step": 1741, "epoch": 2 }, { "type": "loss", "content": 0.006946610752493143, "timestamp": "2025-10-01 04:13:22.027259", "step": 1742, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:22.060826", "step": 1742, "epoch": 2 }, { "type": "loss", "content": 0.010857383720576763, "timestamp": "2025-10-01 04:13:22.064105", "step": 1743, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:22.095093", "step": 1743, "epoch": 2 }, { "type": "loss", "content": 0.02855035290122032, "timestamp": "2025-10-01 04:13:22.119146", "step": 1744, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:22.150609", "step": 1744, "epoch": 2 }, { "type": "loss", "content": 0.0031122262589633465, "timestamp": "2025-10-01 04:13:22.154174", "step": 1745, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:22.186931", "step": 1745, "epoch": 2 }, { "type": "loss", "content": 0.0025793740060180426, "timestamp": "2025-10-01 04:13:22.189117", "step": 1746, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:22.219949", "step": 1746, "epoch": 2 }, { "type": "loss", "content": 0.002730799140408635, "timestamp": "2025-10-01 04:13:22.222250", "step": 1747, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:22.253837", "step": 1747, "epoch": 2 }, { "type": "loss", "content": 0.0016422842163592577, "timestamp": "2025-10-01 04:13:22.278145", "step": 1748, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:22.313150", "step": 1748, "epoch": 2 }, { "type": "loss", "content": 0.0013755019754171371, "timestamp": "2025-10-01 04:13:22.315281", "step": 1749, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:22.347030", "step": 1749, "epoch": 2 }, { "type": "loss", "content": 0.005876249633729458, "timestamp": "2025-10-01 04:13:22.349291", "step": 1750, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:22.381730", "step": 1750, "epoch": 2 }, { "type": "loss", "content": 0.013547523878514767, "timestamp": "2025-10-01 04:13:22.383937", "step": 1751, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:22.417666", "step": 1751, "epoch": 2 }, { "type": "loss", "content": 0.0024058823473751545, "timestamp": "2025-10-01 04:13:22.441879", "step": 1752, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:22.477321", "step": 1752, "epoch": 2 }, { "type": "loss", "content": 0.008786443620920181, "timestamp": "2025-10-01 04:13:22.481635", "step": 1753, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:22.514109", "step": 1753, "epoch": 2 }, { "type": "loss", "content": 0.037695612758398056, "timestamp": "2025-10-01 04:13:22.520794", "step": 1754, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:22.553046", "step": 1754, "epoch": 2 }, { "type": "loss", "content": 0.006795606575906277, "timestamp": "2025-10-01 04:13:22.556391", "step": 1755, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:22.591832", "step": 1755, "epoch": 2 }, { "type": "loss", "content": 0.01414434053003788, "timestamp": "2025-10-01 04:13:22.615965", "step": 1756, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:22.648091", "step": 1756, "epoch": 2 }, { "type": "loss", "content": 0.0011864519910886884, "timestamp": "2025-10-01 04:13:22.650351", "step": 1757, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:22.685759", "step": 1757, "epoch": 2 }, { "type": "loss", "content": 0.003651248523965478, "timestamp": "2025-10-01 04:13:22.688110", "step": 1758, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:22.718855", "step": 1758, "epoch": 2 }, { "type": "loss", "content": 0.004483689088374376, "timestamp": "2025-10-01 04:13:22.722450", "step": 1759, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:22.752697", "step": 1759, "epoch": 2 }, { "type": "loss", "content": 0.01843567192554474, "timestamp": "2025-10-01 04:13:22.777064", "step": 1760, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:22.807820", "step": 1760, "epoch": 2 }, { "type": "loss", "content": 0.006289682351052761, "timestamp": "2025-10-01 04:13:22.811725", "step": 1761, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:22.843814", "step": 1761, "epoch": 2 }, { "type": "loss", "content": 0.0012012216029688716, "timestamp": "2025-10-01 04:13:22.846337", "step": 1762, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:22.877408", "step": 1762, "epoch": 2 }, { "type": "loss", "content": 0.0004856856248807162, "timestamp": "2025-10-01 04:13:22.879897", "step": 1763, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:22.911346", "step": 1763, "epoch": 2 }, { "type": "loss", "content": 0.007547799032181501, "timestamp": "2025-10-01 04:13:22.935664", "step": 1764, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:22.967058", "step": 1764, "epoch": 2 }, { "type": "loss", "content": 0.007898692972958088, "timestamp": "2025-10-01 04:13:22.969615", "step": 1765, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:23.000624", "step": 1765, "epoch": 2 }, { "type": "loss", "content": 0.000819383654743433, "timestamp": "2025-10-01 04:13:23.003621", "step": 1766, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:23.042682", "step": 1766, "epoch": 2 }, { "type": "loss", "content": 0.0007117543718777597, "timestamp": "2025-10-01 04:13:23.045222", "step": 1767, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:13:23.808408", "step": 1767, "epoch": 2 }, { "type": "pplx", "content": 68241018.03120764, "timestamp": "2025-10-01 04:13:23.810763", "step": 1767, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:23.840481", "step": 1767, "epoch": 2 }, { "type": "loss", "content": 0.0007178504602052271, "timestamp": "2025-10-01 04:13:23.865251", "step": 1768, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:23.897832", "step": 1768, "epoch": 2 }, { "type": "loss", "content": 0.0072966525331139565, "timestamp": "2025-10-01 04:13:23.901304", "step": 1769, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:23.934168", "step": 1769, "epoch": 2 }, { "type": "loss", "content": 0.0020607777405530214, "timestamp": "2025-10-01 04:13:23.937030", "step": 1770, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:23.969218", "step": 1770, "epoch": 2 }, { "type": "loss", "content": 0.0053713517263531685, "timestamp": "2025-10-01 04:13:23.971527", "step": 1771, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:24.003104", "step": 1771, "epoch": 2 }, { "type": "loss", "content": 0.00041605104343034327, "timestamp": "2025-10-01 04:13:24.028077", "step": 1772, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:24.060185", "step": 1772, "epoch": 2 }, { "type": "loss", "content": 0.0013556292979046702, "timestamp": "2025-10-01 04:13:24.062501", "step": 1773, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:24.094690", "step": 1773, "epoch": 2 }, { "type": "loss", "content": 0.008703082799911499, "timestamp": "2025-10-01 04:13:24.096944", "step": 1774, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:24.129949", "step": 1774, "epoch": 2 }, { "type": "loss", "content": 0.012519395910203457, "timestamp": "2025-10-01 04:13:24.132520", "step": 1775, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:24.164454", "step": 1775, "epoch": 2 }, { "type": "loss", "content": 0.002101104473695159, "timestamp": "2025-10-01 04:13:24.188362", "step": 1776, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:24.221822", "step": 1776, "epoch": 2 }, { "type": "loss", "content": 0.0025045033544301987, "timestamp": "2025-10-01 04:13:24.224252", "step": 1777, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:24.256233", "step": 1777, "epoch": 2 }, { "type": "loss", "content": 0.001200907165184617, "timestamp": "2025-10-01 04:13:24.258386", "step": 1778, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:24.290209", "step": 1778, "epoch": 2 }, { "type": "loss", "content": 0.0042821685783565044, "timestamp": "2025-10-01 04:13:24.292482", "step": 1779, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:24.323204", "step": 1779, "epoch": 2 }, { "type": "loss", "content": 0.001926648779772222, "timestamp": "2025-10-01 04:13:24.347615", "step": 1780, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:24.379004", "step": 1780, "epoch": 2 }, { "type": "loss", "content": 0.0010534462053328753, "timestamp": "2025-10-01 04:13:24.381470", "step": 1781, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:24.412959", "step": 1781, "epoch": 2 }, { "type": "loss", "content": 0.003271149704232812, "timestamp": "2025-10-01 04:13:24.415052", "step": 1782, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:24.446892", "step": 1782, "epoch": 2 }, { "type": "loss", "content": 0.00037074877764098346, "timestamp": "2025-10-01 04:13:24.449025", "step": 1783, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:24.480018", "step": 1783, "epoch": 2 }, { "type": "loss", "content": 0.0007882600766606629, "timestamp": "2025-10-01 04:13:24.503958", "step": 1784, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:24.534667", "step": 1784, "epoch": 2 }, { "type": "loss", "content": 0.0005073948414064944, "timestamp": "2025-10-01 04:13:24.537978", "step": 1785, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:24.570750", "step": 1785, "epoch": 2 }, { "type": "loss", "content": 0.0011808349518105388, "timestamp": "2025-10-01 04:13:24.574587", "step": 1786, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:24.605971", "step": 1786, "epoch": 2 }, { "type": "loss", "content": 0.015473189763724804, "timestamp": "2025-10-01 04:13:24.608161", "step": 1787, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:24.639638", "step": 1787, "epoch": 2 }, { "type": "loss", "content": 0.01565859653055668, "timestamp": "2025-10-01 04:13:24.664111", "step": 1788, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:24.695807", "step": 1788, "epoch": 2 }, { "type": "loss", "content": 0.000664880673866719, "timestamp": "2025-10-01 04:13:24.698227", "step": 1789, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:24.729936", "step": 1789, "epoch": 2 }, { "type": "loss", "content": 0.0029478694777935743, "timestamp": "2025-10-01 04:13:24.732024", "step": 1790, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:24.762747", "step": 1790, "epoch": 2 }, { "type": "loss", "content": 0.0037737940438091755, "timestamp": "2025-10-01 04:13:24.765005", "step": 1791, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:24.795681", "step": 1791, "epoch": 2 }, { "type": "loss", "content": 0.006180554162710905, "timestamp": "2025-10-01 04:13:24.820653", "step": 1792, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:24.851306", "step": 1792, "epoch": 2 }, { "type": "loss", "content": 0.002698136493563652, "timestamp": "2025-10-01 04:13:24.854328", "step": 1793, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:24.886677", "step": 1793, "epoch": 2 }, { "type": "loss", "content": 0.0008915706421248615, "timestamp": "2025-10-01 04:13:24.889168", "step": 1794, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:24.921713", "step": 1794, "epoch": 2 }, { "type": "loss", "content": 0.0012432237854227424, "timestamp": "2025-10-01 04:13:24.923776", "step": 1795, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:24.954273", "step": 1795, "epoch": 2 }, { "type": "loss", "content": 0.0019791352096945047, "timestamp": "2025-10-01 04:13:24.978167", "step": 1796, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:25.009990", "step": 1796, "epoch": 2 }, { "type": "loss", "content": 0.018590480089187622, "timestamp": "2025-10-01 04:13:25.012037", "step": 1797, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:25.043127", "step": 1797, "epoch": 2 }, { "type": "loss", "content": 0.037092529237270355, "timestamp": "2025-10-01 04:13:25.045790", "step": 1798, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:25.077599", "step": 1798, "epoch": 2 }, { "type": "loss", "content": 0.02361510694026947, "timestamp": "2025-10-01 04:13:25.079796", "step": 1799, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:25.110996", "step": 1799, "epoch": 2 }, { "type": "loss", "content": 0.030858786776661873, "timestamp": "2025-10-01 04:13:25.135008", "step": 1800, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:25.165488", "step": 1800, "epoch": 2 }, { "type": "loss", "content": 0.0010235338704660535, "timestamp": "2025-10-01 04:13:25.167691", "step": 1801, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:25.198262", "step": 1801, "epoch": 2 }, { "type": "loss", "content": 0.015298736281692982, "timestamp": "2025-10-01 04:13:25.200240", "step": 1802, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:25.232740", "step": 1802, "epoch": 2 }, { "type": "loss", "content": 0.001420345390215516, "timestamp": "2025-10-01 04:13:25.234921", "step": 1803, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:25.265541", "step": 1803, "epoch": 2 }, { "type": "loss", "content": 0.0029548178426921368, "timestamp": "2025-10-01 04:13:25.289429", "step": 1804, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:25.320826", "step": 1804, "epoch": 2 }, { "type": "loss", "content": 0.01599166728556156, "timestamp": "2025-10-01 04:13:25.323047", "step": 1805, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:25.354007", "step": 1805, "epoch": 2 }, { "type": "loss", "content": 0.0019125614780932665, "timestamp": "2025-10-01 04:13:25.356149", "step": 1806, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:25.387220", "step": 1806, "epoch": 2 }, { "type": "loss", "content": 0.022777797654271126, "timestamp": "2025-10-01 04:13:25.389367", "step": 1807, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:25.424041", "step": 1807, "epoch": 2 }, { "type": "loss", "content": 0.004297844599932432, "timestamp": "2025-10-01 04:13:25.447850", "step": 1808, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:25.479354", "step": 1808, "epoch": 2 }, { "type": "loss", "content": 0.0009682009113021195, "timestamp": "2025-10-01 04:13:25.481785", "step": 1809, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:25.512359", "step": 1809, "epoch": 2 }, { "type": "loss", "content": 0.0078015453182160854, "timestamp": "2025-10-01 04:13:25.514773", "step": 1810, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:25.545490", "step": 1810, "epoch": 2 }, { "type": "loss", "content": 0.008723942562937737, "timestamp": "2025-10-01 04:13:25.547920", "step": 1811, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:25.578423", "step": 1811, "epoch": 2 }, { "type": "loss", "content": 0.005301562137901783, "timestamp": "2025-10-01 04:13:25.602838", "step": 1812, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:25.633262", "step": 1812, "epoch": 2 }, { "type": "loss", "content": 0.002418793737888336, "timestamp": "2025-10-01 04:13:25.635792", "step": 1813, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:25.666735", "step": 1813, "epoch": 2 }, { "type": "loss", "content": 0.0029904316179454327, "timestamp": "2025-10-01 04:13:25.669067", "step": 1814, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:25.699870", "step": 1814, "epoch": 2 }, { "type": "loss", "content": 0.006601743865758181, "timestamp": "2025-10-01 04:13:25.702131", "step": 1815, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:25.733411", "step": 1815, "epoch": 2 }, { "type": "loss", "content": 0.0020187620539218187, "timestamp": "2025-10-01 04:13:25.757399", "step": 1816, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:25.788661", "step": 1816, "epoch": 2 }, { "type": "loss", "content": 0.0014762390637770295, "timestamp": "2025-10-01 04:13:25.790650", "step": 1817, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:25.821422", "step": 1817, "epoch": 2 }, { "type": "loss", "content": 0.035270411521196365, "timestamp": "2025-10-01 04:13:25.823667", "step": 1818, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:25.855237", "step": 1818, "epoch": 2 }, { "type": "loss", "content": 0.02132105454802513, "timestamp": "2025-10-01 04:13:25.858275", "step": 1819, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:25.889586", "step": 1819, "epoch": 2 }, { "type": "loss", "content": 0.0026816707104444504, "timestamp": "2025-10-01 04:13:25.913438", "step": 1820, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:25.944648", "step": 1820, "epoch": 2 }, { "type": "loss", "content": 0.007399698253720999, "timestamp": "2025-10-01 04:13:25.946626", "step": 1821, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:25.977468", "step": 1821, "epoch": 2 }, { "type": "loss", "content": 0.0009930060477927327, "timestamp": "2025-10-01 04:13:25.980770", "step": 1822, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:26.012561", "step": 1822, "epoch": 2 }, { "type": "loss", "content": 0.0034744550939649343, "timestamp": "2025-10-01 04:13:26.015065", "step": 1823, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:26.046539", "step": 1823, "epoch": 2 }, { "type": "loss", "content": 0.008859805762767792, "timestamp": "2025-10-01 04:13:26.073275", "step": 1824, "epoch": 2 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:13:26.888108", "step": 1824, "epoch": 2 }, { "type": "pplx", "content": 74796042.91379905, "timestamp": "2025-10-01 04:13:26.890423", "step": 1824, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:26.921508", "step": 1824, "epoch": 2 }, { "type": "loss", "content": 0.003556231502443552, "timestamp": "2025-10-01 04:13:26.925792", "step": 1825, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:26.960528", "step": 1825, "epoch": 2 }, { "type": "loss", "content": 0.004978193901479244, "timestamp": "2025-10-01 04:13:26.963397", "step": 1826, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:26.996917", "step": 1826, "epoch": 2 }, { "type": "loss", "content": 0.0016510151326656342, "timestamp": "2025-10-01 04:13:26.999777", "step": 1827, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:27.033159", "step": 1827, "epoch": 2 }, { "type": "loss", "content": 0.007656362373381853, "timestamp": "2025-10-01 04:13:27.056743", "step": 1828, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:27.088814", "step": 1828, "epoch": 2 }, { "type": "loss", "content": 0.0006503559998236597, "timestamp": "2025-10-01 04:13:27.093304", "step": 1829, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:27.125807", "step": 1829, "epoch": 2 }, { "type": "loss", "content": 0.03609628230333328, "timestamp": "2025-10-01 04:13:27.129373", "step": 1830, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:27.161355", "step": 1830, "epoch": 2 }, { "type": "loss", "content": 0.00043880558223463595, "timestamp": "2025-10-01 04:13:27.164355", "step": 1831, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:27.197251", "step": 1831, "epoch": 2 }, { "type": "loss", "content": 0.003280007978901267, "timestamp": "2025-10-01 04:13:27.221554", "step": 1832, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:27.256437", "step": 1832, "epoch": 2 }, { "type": "loss", "content": 0.002217098604887724, "timestamp": "2025-10-01 04:13:27.259061", "step": 1833, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:27.292756", "step": 1833, "epoch": 2 }, { "type": "loss", "content": 0.0035265518818050623, "timestamp": "2025-10-01 04:13:27.295747", "step": 1834, "epoch": 2 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:27.347395", "step": 1834, "epoch": 3 }, { "type": "loss", "content": 0.03178298473358154, "timestamp": "2025-10-01 04:13:27.350825", "step": 1835, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:27.384450", "step": 1835, "epoch": 3 }, { "type": "loss", "content": 0.023568039759993553, "timestamp": "2025-10-01 04:13:27.409341", "step": 1836, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:27.445995", "step": 1836, "epoch": 3 }, { "type": "loss", "content": 0.005751292686909437, "timestamp": "2025-10-01 04:13:27.449020", "step": 1837, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:27.482748", "step": 1837, "epoch": 3 }, { "type": "loss", "content": 0.0012173604918643832, "timestamp": "2025-10-01 04:13:27.485326", "step": 1838, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:27.518569", "step": 1838, "epoch": 3 }, { "type": "loss", "content": 0.056264422833919525, "timestamp": "2025-10-01 04:13:27.521494", "step": 1839, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:27.554854", "step": 1839, "epoch": 3 }, { "type": "loss", "content": 0.018358344212174416, "timestamp": "2025-10-01 04:13:27.579895", "step": 1840, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:27.613386", "step": 1840, "epoch": 3 }, { "type": "loss", "content": 0.010695835575461388, "timestamp": "2025-10-01 04:13:27.616027", "step": 1841, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:27.650847", "step": 1841, "epoch": 3 }, { "type": "loss", "content": 0.0009615811286494136, "timestamp": "2025-10-01 04:13:27.653408", "step": 1842, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:27.686778", "step": 1842, "epoch": 3 }, { "type": "loss", "content": 0.0005880445241928101, "timestamp": "2025-10-01 04:13:27.689243", "step": 1843, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:27.722631", "step": 1843, "epoch": 3 }, { "type": "loss", "content": 0.034726135432720184, "timestamp": "2025-10-01 04:13:27.747599", "step": 1844, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:27.781394", "step": 1844, "epoch": 3 }, { "type": "loss", "content": 0.009501738473773003, "timestamp": "2025-10-01 04:13:27.784021", "step": 1845, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:27.818136", "step": 1845, "epoch": 3 }, { "type": "loss", "content": 0.001588527811691165, "timestamp": "2025-10-01 04:13:27.820910", "step": 1846, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:27.855251", "step": 1846, "epoch": 3 }, { "type": "loss", "content": 0.010716503486037254, "timestamp": "2025-10-01 04:13:27.857868", "step": 1847, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:27.891326", "step": 1847, "epoch": 3 }, { "type": "loss", "content": 0.029663624241948128, "timestamp": "2025-10-01 04:13:27.915703", "step": 1848, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:27.950094", "step": 1848, "epoch": 3 }, { "type": "loss", "content": 0.013627013191580772, "timestamp": "2025-10-01 04:13:27.952950", "step": 1849, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:27.986386", "step": 1849, "epoch": 3 }, { "type": "loss", "content": 0.014335517771542072, "timestamp": "2025-10-01 04:13:27.989799", "step": 1850, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:28.021633", "step": 1850, "epoch": 3 }, { "type": "loss", "content": 0.011362132616341114, "timestamp": "2025-10-01 04:13:28.024810", "step": 1851, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:28.058184", "step": 1851, "epoch": 3 }, { "type": "loss", "content": 0.004223748110234737, "timestamp": "2025-10-01 04:13:28.082747", "step": 1852, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:28.116575", "step": 1852, "epoch": 3 }, { "type": "loss", "content": 0.012491399422287941, "timestamp": "2025-10-01 04:13:28.119353", "step": 1853, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:28.151658", "step": 1853, "epoch": 3 }, { "type": "loss", "content": 0.013354037888348103, "timestamp": "2025-10-01 04:13:28.153970", "step": 1854, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:28.186118", "step": 1854, "epoch": 3 }, { "type": "loss", "content": 0.021396491676568985, "timestamp": "2025-10-01 04:13:28.194063", "step": 1855, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:28.225442", "step": 1855, "epoch": 3 }, { "type": "loss", "content": 0.05161554366350174, "timestamp": "2025-10-01 04:13:28.249089", "step": 1856, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:28.280062", "step": 1856, "epoch": 3 }, { "type": "loss", "content": 0.010022538714110851, "timestamp": "2025-10-01 04:13:28.281987", "step": 1857, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:28.312848", "step": 1857, "epoch": 3 }, { "type": "loss", "content": 0.011016866192221642, "timestamp": "2025-10-01 04:13:28.315012", "step": 1858, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:28.345778", "step": 1858, "epoch": 3 }, { "type": "loss", "content": 0.006926259491592646, "timestamp": "2025-10-01 04:13:28.347844", "step": 1859, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:28.379938", "step": 1859, "epoch": 3 }, { "type": "loss", "content": 0.004610821604728699, "timestamp": "2025-10-01 04:13:28.403784", "step": 1860, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:28.436043", "step": 1860, "epoch": 3 }, { "type": "loss", "content": 0.004365658853203058, "timestamp": "2025-10-01 04:13:28.438057", "step": 1861, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:28.470113", "step": 1861, "epoch": 3 }, { "type": "loss", "content": 0.027243614196777344, "timestamp": "2025-10-01 04:13:28.472186", "step": 1862, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:28.504620", "step": 1862, "epoch": 3 }, { "type": "loss", "content": 0.0072929286397993565, "timestamp": "2025-10-01 04:13:28.506635", "step": 1863, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:28.539042", "step": 1863, "epoch": 3 }, { "type": "loss", "content": 0.012515954673290253, "timestamp": "2025-10-01 04:13:28.562938", "step": 1864, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:28.595735", "step": 1864, "epoch": 3 }, { "type": "loss", "content": 0.011179575696587563, "timestamp": "2025-10-01 04:13:28.597981", "step": 1865, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:28.630308", "step": 1865, "epoch": 3 }, { "type": "loss", "content": 0.022026440128684044, "timestamp": "2025-10-01 04:13:28.632844", "step": 1866, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:28.665183", "step": 1866, "epoch": 3 }, { "type": "loss", "content": 0.008464011363685131, "timestamp": "2025-10-01 04:13:28.667271", "step": 1867, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:28.698879", "step": 1867, "epoch": 3 }, { "type": "loss", "content": 0.018374668434262276, "timestamp": "2025-10-01 04:13:28.722783", "step": 1868, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:28.755232", "step": 1868, "epoch": 3 }, { "type": "loss", "content": 0.0033123858738690615, "timestamp": "2025-10-01 04:13:28.757207", "step": 1869, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:28.789810", "step": 1869, "epoch": 3 }, { "type": "loss", "content": 0.01134712714701891, "timestamp": "2025-10-01 04:13:28.791793", "step": 1870, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:28.823941", "step": 1870, "epoch": 3 }, { "type": "loss", "content": 0.015737758949398994, "timestamp": "2025-10-01 04:13:28.826768", "step": 1871, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:28.858581", "step": 1871, "epoch": 3 }, { "type": "loss", "content": 0.004202558193355799, "timestamp": "2025-10-01 04:13:28.882595", "step": 1872, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:28.914707", "step": 1872, "epoch": 3 }, { "type": "loss", "content": 0.005341304000467062, "timestamp": "2025-10-01 04:13:28.916798", "step": 1873, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:28.947280", "step": 1873, "epoch": 3 }, { "type": "loss", "content": 0.010225766338407993, "timestamp": "2025-10-01 04:13:28.949059", "step": 1874, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:28.980790", "step": 1874, "epoch": 3 }, { "type": "loss", "content": 0.007927073165774345, "timestamp": "2025-10-01 04:13:28.983055", "step": 1875, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:29.013392", "step": 1875, "epoch": 3 }, { "type": "loss", "content": 0.015908638015389442, "timestamp": "2025-10-01 04:13:29.037108", "step": 1876, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:29.068314", "step": 1876, "epoch": 3 }, { "type": "loss", "content": 0.010531976819038391, "timestamp": "2025-10-01 04:13:29.070716", "step": 1877, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:29.101504", "step": 1877, "epoch": 3 }, { "type": "loss", "content": 0.0012402069987729192, "timestamp": "2025-10-01 04:13:29.103558", "step": 1878, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:29.135058", "step": 1878, "epoch": 3 }, { "type": "loss", "content": 0.00214685732498765, "timestamp": "2025-10-01 04:13:29.137054", "step": 1879, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:29.167608", "step": 1879, "epoch": 3 }, { "type": "loss", "content": 0.0022061713971197605, "timestamp": "2025-10-01 04:13:29.191987", "step": 1880, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:29.223633", "step": 1880, "epoch": 3 }, { "type": "loss", "content": 0.004665750078856945, "timestamp": "2025-10-01 04:13:29.225559", "step": 1881, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:13:29.954864", "step": 1881, "epoch": 3 }, { "type": "pplx", "content": 65838218.305152945, "timestamp": "2025-10-01 04:13:29.956724", "step": 1881, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:29.986217", "step": 1881, "epoch": 3 }, { "type": "loss", "content": 0.0013645051512867212, "timestamp": "2025-10-01 04:13:29.988092", "step": 1882, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:30.018620", "step": 1882, "epoch": 3 }, { "type": "loss", "content": 0.001059826696291566, "timestamp": "2025-10-01 04:13:30.020716", "step": 1883, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:30.053436", "step": 1883, "epoch": 3 }, { "type": "loss", "content": 0.0016396061982959509, "timestamp": "2025-10-01 04:13:30.077176", "step": 1884, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:30.108481", "step": 1884, "epoch": 3 }, { "type": "loss", "content": 0.0006029635551385581, "timestamp": "2025-10-01 04:13:30.110611", "step": 1885, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:30.143587", "step": 1885, "epoch": 3 }, { "type": "loss", "content": 0.03112313151359558, "timestamp": "2025-10-01 04:13:30.145822", "step": 1886, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:30.177911", "step": 1886, "epoch": 3 }, { "type": "loss", "content": 0.02117547206580639, "timestamp": "2025-10-01 04:13:30.180583", "step": 1887, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:30.213919", "step": 1887, "epoch": 3 }, { "type": "loss", "content": 0.005383083131164312, "timestamp": "2025-10-01 04:13:30.237897", "step": 1888, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:30.273917", "step": 1888, "epoch": 3 }, { "type": "loss", "content": 0.011658556759357452, "timestamp": "2025-10-01 04:13:30.276449", "step": 1889, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:30.309107", "step": 1889, "epoch": 3 }, { "type": "loss", "content": 0.0026283669285476208, "timestamp": "2025-10-01 04:13:30.311166", "step": 1890, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:30.343471", "step": 1890, "epoch": 3 }, { "type": "loss", "content": 0.0016616786597296596, "timestamp": "2025-10-01 04:13:30.345473", "step": 1891, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:30.376082", "step": 1891, "epoch": 3 }, { "type": "loss", "content": 0.006170527543872595, "timestamp": "2025-10-01 04:13:30.399812", "step": 1892, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:30.430571", "step": 1892, "epoch": 3 }, { "type": "loss", "content": 0.011967835947871208, "timestamp": "2025-10-01 04:13:30.432581", "step": 1893, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:30.462710", "step": 1893, "epoch": 3 }, { "type": "loss", "content": 0.03743167966604233, "timestamp": "2025-10-01 04:13:30.464893", "step": 1894, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:30.496021", "step": 1894, "epoch": 3 }, { "type": "loss", "content": 0.0017973927315324545, "timestamp": "2025-10-01 04:13:30.498004", "step": 1895, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:30.527694", "step": 1895, "epoch": 3 }, { "type": "loss", "content": 0.0042338259518146515, "timestamp": "2025-10-01 04:13:30.551287", "step": 1896, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:30.581793", "step": 1896, "epoch": 3 }, { "type": "loss", "content": 0.0005765099194832146, "timestamp": "2025-10-01 04:13:30.583841", "step": 1897, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:30.614291", "step": 1897, "epoch": 3 }, { "type": "loss", "content": 0.0013105726102367043, "timestamp": "2025-10-01 04:13:30.616201", "step": 1898, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:30.650389", "step": 1898, "epoch": 3 }, { "type": "loss", "content": 0.0006543918279930949, "timestamp": "2025-10-01 04:13:30.652414", "step": 1899, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:30.682310", "step": 1899, "epoch": 3 }, { "type": "loss", "content": 0.040685024112463, "timestamp": "2025-10-01 04:13:30.705820", "step": 1900, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:30.736117", "step": 1900, "epoch": 3 }, { "type": "loss", "content": 0.0019543597009032965, "timestamp": "2025-10-01 04:13:30.741473", "step": 1901, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:30.772803", "step": 1901, "epoch": 3 }, { "type": "loss", "content": 0.014719049446284771, "timestamp": "2025-10-01 04:13:30.775507", "step": 1902, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:30.805883", "step": 1902, "epoch": 3 }, { "type": "loss", "content": 0.014800036326050758, "timestamp": "2025-10-01 04:13:30.808051", "step": 1903, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:30.838300", "step": 1903, "epoch": 3 }, { "type": "loss", "content": 0.03836487978696823, "timestamp": "2025-10-01 04:13:30.866424", "step": 1904, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:30.897353", "step": 1904, "epoch": 3 }, { "type": "loss", "content": 0.0014941886765882373, "timestamp": "2025-10-01 04:13:30.904541", "step": 1905, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:30.935950", "step": 1905, "epoch": 3 }, { "type": "loss", "content": 0.03499216586351395, "timestamp": "2025-10-01 04:13:30.938392", "step": 1906, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:30.968920", "step": 1906, "epoch": 3 }, { "type": "loss", "content": 0.0013538615312427282, "timestamp": "2025-10-01 04:13:30.974435", "step": 1907, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:31.011062", "step": 1907, "epoch": 3 }, { "type": "loss", "content": 0.02370210364460945, "timestamp": "2025-10-01 04:13:31.036779", "step": 1908, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:31.071927", "step": 1908, "epoch": 3 }, { "type": "loss", "content": 0.014159292913973331, "timestamp": "2025-10-01 04:13:31.074016", "step": 1909, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:31.104665", "step": 1909, "epoch": 3 }, { "type": "loss", "content": 0.01698492281138897, "timestamp": "2025-10-01 04:13:31.107265", "step": 1910, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:31.138415", "step": 1910, "epoch": 3 }, { "type": "loss", "content": 0.013147769495844841, "timestamp": "2025-10-01 04:13:31.140256", "step": 1911, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:31.170541", "step": 1911, "epoch": 3 }, { "type": "loss", "content": 0.0037520453333854675, "timestamp": "2025-10-01 04:13:31.195782", "step": 1912, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:31.229118", "step": 1912, "epoch": 3 }, { "type": "loss", "content": 0.015137665905058384, "timestamp": "2025-10-01 04:13:31.231257", "step": 1913, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:31.261872", "step": 1913, "epoch": 3 }, { "type": "loss", "content": 0.00783214159309864, "timestamp": "2025-10-01 04:13:31.264099", "step": 1914, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:31.294172", "step": 1914, "epoch": 3 }, { "type": "loss", "content": 0.02275925502181053, "timestamp": "2025-10-01 04:13:31.296275", "step": 1915, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:31.326140", "step": 1915, "epoch": 3 }, { "type": "loss", "content": 0.012978742830455303, "timestamp": "2025-10-01 04:13:31.349911", "step": 1916, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:31.380199", "step": 1916, "epoch": 3 }, { "type": "loss", "content": 0.0010948796989396214, "timestamp": "2025-10-01 04:13:31.383428", "step": 1917, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:31.413322", "step": 1917, "epoch": 3 }, { "type": "loss", "content": 0.021318109706044197, "timestamp": "2025-10-01 04:13:31.415399", "step": 1918, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:31.445402", "step": 1918, "epoch": 3 }, { "type": "loss", "content": 0.005678139626979828, "timestamp": "2025-10-01 04:13:31.447655", "step": 1919, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:31.477948", "step": 1919, "epoch": 3 }, { "type": "loss", "content": 0.04414077475667, "timestamp": "2025-10-01 04:13:31.501749", "step": 1920, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:31.532667", "step": 1920, "epoch": 3 }, { "type": "loss", "content": 0.011896932497620583, "timestamp": "2025-10-01 04:13:31.534894", "step": 1921, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:31.566668", "step": 1921, "epoch": 3 }, { "type": "loss", "content": 0.015560947358608246, "timestamp": "2025-10-01 04:13:31.569044", "step": 1922, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:31.599157", "step": 1922, "epoch": 3 }, { "type": "loss", "content": 0.04291582852602005, "timestamp": "2025-10-01 04:13:31.601088", "step": 1923, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:31.631716", "step": 1923, "epoch": 3 }, { "type": "loss", "content": 0.02800329215824604, "timestamp": "2025-10-01 04:13:31.655283", "step": 1924, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:31.686117", "step": 1924, "epoch": 3 }, { "type": "loss", "content": 0.030857792124152184, "timestamp": "2025-10-01 04:13:31.688098", "step": 1925, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:31.717738", "step": 1925, "epoch": 3 }, { "type": "loss", "content": 0.027378613129258156, "timestamp": "2025-10-01 04:13:31.719815", "step": 1926, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:31.751177", "step": 1926, "epoch": 3 }, { "type": "loss", "content": 0.0037630724254995584, "timestamp": "2025-10-01 04:13:31.753563", "step": 1927, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:31.784129", "step": 1927, "epoch": 3 }, { "type": "loss", "content": 0.009359322488307953, "timestamp": "2025-10-01 04:13:31.809218", "step": 1928, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:31.839805", "step": 1928, "epoch": 3 }, { "type": "loss", "content": 0.021776987239718437, "timestamp": "2025-10-01 04:13:31.841711", "step": 1929, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:31.871655", "step": 1929, "epoch": 3 }, { "type": "loss", "content": 0.04720321670174599, "timestamp": "2025-10-01 04:13:31.873744", "step": 1930, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:31.903552", "step": 1930, "epoch": 3 }, { "type": "loss", "content": 0.010065094567835331, "timestamp": "2025-10-01 04:13:31.905664", "step": 1931, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:31.935470", "step": 1931, "epoch": 3 }, { "type": "loss", "content": 0.005961592774838209, "timestamp": "2025-10-01 04:13:31.959241", "step": 1932, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:31.989950", "step": 1932, "epoch": 3 }, { "type": "loss", "content": 0.0008259662427008152, "timestamp": "2025-10-01 04:13:31.992442", "step": 1933, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:32.022090", "step": 1933, "epoch": 3 }, { "type": "loss", "content": 0.008287528529763222, "timestamp": "2025-10-01 04:13:32.024453", "step": 1934, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:32.055265", "step": 1934, "epoch": 3 }, { "type": "loss", "content": 0.002223608084022999, "timestamp": "2025-10-01 04:13:32.057659", "step": 1935, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:32.087501", "step": 1935, "epoch": 3 }, { "type": "loss", "content": 0.0037765316665172577, "timestamp": "2025-10-01 04:13:32.111706", "step": 1936, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:32.142388", "step": 1936, "epoch": 3 }, { "type": "loss", "content": 0.010634335689246655, "timestamp": "2025-10-01 04:13:32.144384", "step": 1937, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:32.175259", "step": 1937, "epoch": 3 }, { "type": "loss", "content": 0.008796789683401585, "timestamp": "2025-10-01 04:13:32.177824", "step": 1938, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:13:32.902030", "step": 1938, "epoch": 3 }, { "type": "pplx", "content": 59471434.40554708, "timestamp": "2025-10-01 04:13:32.903935", "step": 1938, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:32.933279", "step": 1938, "epoch": 3 }, { "type": "loss", "content": 0.04188920930027962, "timestamp": "2025-10-01 04:13:32.935486", "step": 1939, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:32.965804", "step": 1939, "epoch": 3 }, { "type": "loss", "content": 0.001124311238527298, "timestamp": "2025-10-01 04:13:32.989586", "step": 1940, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:33.019772", "step": 1940, "epoch": 3 }, { "type": "loss", "content": 0.004740908741950989, "timestamp": "2025-10-01 04:13:33.021948", "step": 1941, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:33.053776", "step": 1941, "epoch": 3 }, { "type": "loss", "content": 0.015581603161990643, "timestamp": "2025-10-01 04:13:33.056248", "step": 1942, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:33.085781", "step": 1942, "epoch": 3 }, { "type": "loss", "content": 0.05138517543673515, "timestamp": "2025-10-01 04:13:33.088154", "step": 1943, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:13:33.119782", "step": 1943, "epoch": 3 }, { "type": "loss", "content": 0.05064978078007698, "timestamp": "2025-10-01 04:13:33.143991", "step": 1944, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:33.175911", "step": 1944, "epoch": 3 }, { "type": "loss", "content": 0.002924917731434107, "timestamp": "2025-10-01 04:13:33.178647", "step": 1945, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:33.209408", "step": 1945, "epoch": 3 }, { "type": "loss", "content": 0.008786286227405071, "timestamp": "2025-10-01 04:13:33.211405", "step": 1946, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:33.241561", "step": 1946, "epoch": 3 }, { "type": "loss", "content": 0.004422423895448446, "timestamp": "2025-10-01 04:13:33.243950", "step": 1947, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:33.274674", "step": 1947, "epoch": 3 }, { "type": "loss", "content": 0.004494368098676205, "timestamp": "2025-10-01 04:13:33.298784", "step": 1948, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:33.329977", "step": 1948, "epoch": 3 }, { "type": "loss", "content": 0.002485204953700304, "timestamp": "2025-10-01 04:13:33.332296", "step": 1949, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:33.362428", "step": 1949, "epoch": 3 }, { "type": "loss", "content": 0.018244251608848572, "timestamp": "2025-10-01 04:13:33.364472", "step": 1950, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:33.396877", "step": 1950, "epoch": 3 }, { "type": "loss", "content": 0.012204146943986416, "timestamp": "2025-10-01 04:13:33.399354", "step": 1951, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:33.430007", "step": 1951, "epoch": 3 }, { "type": "loss", "content": 0.017980916425585747, "timestamp": "2025-10-01 04:13:33.454051", "step": 1952, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:33.486631", "step": 1952, "epoch": 3 }, { "type": "loss", "content": 0.0020012136083096266, "timestamp": "2025-10-01 04:13:33.488563", "step": 1953, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:33.519185", "step": 1953, "epoch": 3 }, { "type": "loss", "content": 0.0037194720935076475, "timestamp": "2025-10-01 04:13:33.521326", "step": 1954, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:33.554933", "step": 1954, "epoch": 3 }, { "type": "loss", "content": 0.03667493537068367, "timestamp": "2025-10-01 04:13:33.557126", "step": 1955, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:33.589409", "step": 1955, "epoch": 3 }, { "type": "loss", "content": 0.02140049636363983, "timestamp": "2025-10-01 04:13:33.614408", "step": 1956, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:33.645953", "step": 1956, "epoch": 3 }, { "type": "loss", "content": 0.015395239926874638, "timestamp": "2025-10-01 04:13:33.648748", "step": 1957, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:33.679684", "step": 1957, "epoch": 3 }, { "type": "loss", "content": 0.0056080566719174385, "timestamp": "2025-10-01 04:13:33.681994", "step": 1958, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:33.714060", "step": 1958, "epoch": 3 }, { "type": "loss", "content": 0.004278783220797777, "timestamp": "2025-10-01 04:13:33.716110", "step": 1959, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:33.747166", "step": 1959, "epoch": 3 }, { "type": "loss", "content": 0.031007369980216026, "timestamp": "2025-10-01 04:13:33.770944", "step": 1960, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:33.802364", "step": 1960, "epoch": 3 }, { "type": "loss", "content": 0.004366572946310043, "timestamp": "2025-10-01 04:13:33.804460", "step": 1961, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:33.835489", "step": 1961, "epoch": 3 }, { "type": "loss", "content": 0.003953932784497738, "timestamp": "2025-10-01 04:13:33.837553", "step": 1962, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:33.869313", "step": 1962, "epoch": 3 }, { "type": "loss", "content": 0.020310308784246445, "timestamp": "2025-10-01 04:13:33.871664", "step": 1963, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:33.901955", "step": 1963, "epoch": 3 }, { "type": "loss", "content": 0.040750809013843536, "timestamp": "2025-10-01 04:13:33.925390", "step": 1964, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:33.955820", "step": 1964, "epoch": 3 }, { "type": "loss", "content": 0.005061944015324116, "timestamp": "2025-10-01 04:13:33.957668", "step": 1965, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:33.988940", "step": 1965, "epoch": 3 }, { "type": "loss", "content": 0.0036905964370816946, "timestamp": "2025-10-01 04:13:33.991360", "step": 1966, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:34.021359", "step": 1966, "epoch": 3 }, { "type": "loss", "content": 0.005855924915522337, "timestamp": "2025-10-01 04:13:34.023692", "step": 1967, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:34.053658", "step": 1967, "epoch": 3 }, { "type": "loss", "content": 0.02116120420396328, "timestamp": "2025-10-01 04:13:34.077414", "step": 1968, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:34.108331", "step": 1968, "epoch": 3 }, { "type": "loss", "content": 0.002757231006398797, "timestamp": "2025-10-01 04:13:34.110196", "step": 1969, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:34.141899", "step": 1969, "epoch": 3 }, { "type": "loss", "content": 0.011400270275771618, "timestamp": "2025-10-01 04:13:34.144151", "step": 1970, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:34.174519", "step": 1970, "epoch": 3 }, { "type": "loss", "content": 0.011773461475968361, "timestamp": "2025-10-01 04:13:34.176907", "step": 1971, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:34.207821", "step": 1971, "epoch": 3 }, { "type": "loss", "content": 0.02493475005030632, "timestamp": "2025-10-01 04:13:34.231446", "step": 1972, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:34.261596", "step": 1972, "epoch": 3 }, { "type": "loss", "content": 0.021315688267350197, "timestamp": "2025-10-01 04:13:34.263901", "step": 1973, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:34.294275", "step": 1973, "epoch": 3 }, { "type": "loss", "content": 0.008046154864132404, "timestamp": "2025-10-01 04:13:34.296656", "step": 1974, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:34.326601", "step": 1974, "epoch": 3 }, { "type": "loss", "content": 0.024015789851546288, "timestamp": "2025-10-01 04:13:34.328973", "step": 1975, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:34.360233", "step": 1975, "epoch": 3 }, { "type": "loss", "content": 0.025987252593040466, "timestamp": "2025-10-01 04:13:34.383785", "step": 1976, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:34.415222", "step": 1976, "epoch": 3 }, { "type": "loss", "content": 0.008847145363688469, "timestamp": "2025-10-01 04:13:34.417304", "step": 1977, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:34.447961", "step": 1977, "epoch": 3 }, { "type": "loss", "content": 0.01479099690914154, "timestamp": "2025-10-01 04:13:34.450307", "step": 1978, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:34.480525", "step": 1978, "epoch": 3 }, { "type": "loss", "content": 0.010528464801609516, "timestamp": "2025-10-01 04:13:34.482802", "step": 1979, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:34.512384", "step": 1979, "epoch": 3 }, { "type": "loss", "content": 0.0039046481251716614, "timestamp": "2025-10-01 04:13:34.536097", "step": 1980, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:34.567508", "step": 1980, "epoch": 3 }, { "type": "loss", "content": 0.004746978636831045, "timestamp": "2025-10-01 04:13:34.569515", "step": 1981, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:34.598932", "step": 1981, "epoch": 3 }, { "type": "loss", "content": 0.011390685103833675, "timestamp": "2025-10-01 04:13:34.601337", "step": 1982, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:34.631583", "step": 1982, "epoch": 3 }, { "type": "loss", "content": 0.009219239465892315, "timestamp": "2025-10-01 04:13:34.633942", "step": 1983, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:34.664092", "step": 1983, "epoch": 3 }, { "type": "loss", "content": 0.006982100661844015, "timestamp": "2025-10-01 04:13:34.687609", "step": 1984, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:34.718286", "step": 1984, "epoch": 3 }, { "type": "loss", "content": 0.0032235754188150167, "timestamp": "2025-10-01 04:13:34.720387", "step": 1985, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:34.750519", "step": 1985, "epoch": 3 }, { "type": "loss", "content": 0.001993803773075342, "timestamp": "2025-10-01 04:13:34.752547", "step": 1986, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:34.783871", "step": 1986, "epoch": 3 }, { "type": "loss", "content": 0.018395518884062767, "timestamp": "2025-10-01 04:13:34.785819", "step": 1987, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:34.816023", "step": 1987, "epoch": 3 }, { "type": "loss", "content": 0.01245657354593277, "timestamp": "2025-10-01 04:13:34.839668", "step": 1988, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:34.870737", "step": 1988, "epoch": 3 }, { "type": "loss", "content": 0.014887102879583836, "timestamp": "2025-10-01 04:13:34.872680", "step": 1989, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:34.902322", "step": 1989, "epoch": 3 }, { "type": "loss", "content": 0.0028198598884046078, "timestamp": "2025-10-01 04:13:34.904112", "step": 1990, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:34.934024", "step": 1990, "epoch": 3 }, { "type": "loss", "content": 0.04544464498758316, "timestamp": "2025-10-01 04:13:34.936110", "step": 1991, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:34.965363", "step": 1991, "epoch": 3 }, { "type": "loss", "content": 0.0295643862336874, "timestamp": "2025-10-01 04:13:34.988948", "step": 1992, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:35.018979", "step": 1992, "epoch": 3 }, { "type": "loss", "content": 0.05637253448367119, "timestamp": "2025-10-01 04:13:35.021179", "step": 1993, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:35.050950", "step": 1993, "epoch": 3 }, { "type": "loss", "content": 0.015412045642733574, "timestamp": "2025-10-01 04:13:35.052952", "step": 1994, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:35.083055", "step": 1994, "epoch": 3 }, { "type": "loss", "content": 0.0036210103426128626, "timestamp": "2025-10-01 04:13:35.085376", "step": 1995, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:13:35.884810", "step": 1995, "epoch": 3 }, { "type": "pplx", "content": 45753030.53963223, "timestamp": "2025-10-01 04:13:35.886915", "step": 1995, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:35.917191", "step": 1995, "epoch": 3 }, { "type": "loss", "content": 0.005879095755517483, "timestamp": "2025-10-01 04:13:35.941385", "step": 1996, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:35.973474", "step": 1996, "epoch": 3 }, { "type": "loss", "content": 0.005743040703237057, "timestamp": "2025-10-01 04:13:35.976332", "step": 1997, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:36.009613", "step": 1997, "epoch": 3 }, { "type": "loss", "content": 0.016395164653658867, "timestamp": "2025-10-01 04:13:36.011895", "step": 1998, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:36.043705", "step": 1998, "epoch": 3 }, { "type": "loss", "content": 0.02659083716571331, "timestamp": "2025-10-01 04:13:36.046226", "step": 1999, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:36.078620", "step": 1999, "epoch": 3 }, { "type": "loss", "content": 0.0178259015083313, "timestamp": "2025-10-01 04:13:36.102703", "step": 2000, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2000", "timestamp": "2025-10-01 04:13:41.044832", "step": 2000, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:41.084808", "step": 2000, "epoch": 3 }, { "type": "loss", "content": 0.00914577953517437, "timestamp": "2025-10-01 04:13:41.087024", "step": 2001, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:41.117081", "step": 2001, "epoch": 3 }, { "type": "loss", "content": 0.0025305438321083784, "timestamp": "2025-10-01 04:13:41.118808", "step": 2002, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:41.148903", "step": 2002, "epoch": 3 }, { "type": "loss", "content": 0.0016962222289294004, "timestamp": "2025-10-01 04:13:41.150818", "step": 2003, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:41.179960", "step": 2003, "epoch": 3 }, { "type": "loss", "content": 0.038361646234989166, "timestamp": "2025-10-01 04:13:41.203542", "step": 2004, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:41.236584", "step": 2004, "epoch": 3 }, { "type": "loss", "content": 0.0017910029273480177, "timestamp": "2025-10-01 04:13:41.238498", "step": 2005, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:41.267895", "step": 2005, "epoch": 3 }, { "type": "loss", "content": 0.029518509283661842, "timestamp": "2025-10-01 04:13:41.269941", "step": 2006, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:41.299241", "step": 2006, "epoch": 3 }, { "type": "loss", "content": 0.008002429269254208, "timestamp": "2025-10-01 04:13:41.301296", "step": 2007, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:41.331410", "step": 2007, "epoch": 3 }, { "type": "loss", "content": 0.020128855481743813, "timestamp": "2025-10-01 04:13:41.354945", "step": 2008, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:41.385301", "step": 2008, "epoch": 3 }, { "type": "loss", "content": 0.00392829580232501, "timestamp": "2025-10-01 04:13:41.387203", "step": 2009, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:41.416637", "step": 2009, "epoch": 3 }, { "type": "loss", "content": 0.02912677265703678, "timestamp": "2025-10-01 04:13:41.418659", "step": 2010, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:41.449061", "step": 2010, "epoch": 3 }, { "type": "loss", "content": 0.00468469737097621, "timestamp": "2025-10-01 04:13:41.451445", "step": 2011, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:41.481111", "step": 2011, "epoch": 3 }, { "type": "loss", "content": 0.00664009153842926, "timestamp": "2025-10-01 04:13:41.504833", "step": 2012, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:41.534186", "step": 2012, "epoch": 3 }, { "type": "loss", "content": 0.002807890996336937, "timestamp": "2025-10-01 04:13:41.536205", "step": 2013, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:41.565962", "step": 2013, "epoch": 3 }, { "type": "loss", "content": 0.004763583652675152, "timestamp": "2025-10-01 04:13:41.568519", "step": 2014, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:41.599712", "step": 2014, "epoch": 3 }, { "type": "loss", "content": 0.022498924285173416, "timestamp": "2025-10-01 04:13:41.601428", "step": 2015, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:41.631711", "step": 2015, "epoch": 3 }, { "type": "loss", "content": 0.01837407425045967, "timestamp": "2025-10-01 04:13:41.655443", "step": 2016, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:41.685764", "step": 2016, "epoch": 3 }, { "type": "loss", "content": 0.007270206697285175, "timestamp": "2025-10-01 04:13:41.688059", "step": 2017, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:41.719301", "step": 2017, "epoch": 3 }, { "type": "loss", "content": 0.012231186963617802, "timestamp": "2025-10-01 04:13:41.721837", "step": 2018, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:41.752069", "step": 2018, "epoch": 3 }, { "type": "loss", "content": 0.003825768129900098, "timestamp": "2025-10-01 04:13:41.754243", "step": 2019, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:41.784997", "step": 2019, "epoch": 3 }, { "type": "loss", "content": 0.0063703921623528, "timestamp": "2025-10-01 04:13:41.808597", "step": 2020, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:41.838744", "step": 2020, "epoch": 3 }, { "type": "loss", "content": 0.004631161689758301, "timestamp": "2025-10-01 04:13:41.843697", "step": 2021, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:41.880007", "step": 2021, "epoch": 3 }, { "type": "loss", "content": 0.005757237318903208, "timestamp": "2025-10-01 04:13:41.882171", "step": 2022, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:41.914333", "step": 2022, "epoch": 3 }, { "type": "loss", "content": 0.002572003984823823, "timestamp": "2025-10-01 04:13:41.917093", "step": 2023, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:41.949578", "step": 2023, "epoch": 3 }, { "type": "loss", "content": 0.01088919211179018, "timestamp": "2025-10-01 04:13:41.973474", "step": 2024, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:42.006547", "step": 2024, "epoch": 3 }, { "type": "loss", "content": 0.017445290461182594, "timestamp": "2025-10-01 04:13:42.008041", "step": 2025, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:42.044072", "step": 2025, "epoch": 3 }, { "type": "loss", "content": 0.002059668768197298, "timestamp": "2025-10-01 04:13:42.046283", "step": 2026, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:42.078986", "step": 2026, "epoch": 3 }, { "type": "loss", "content": 0.004977476317435503, "timestamp": "2025-10-01 04:13:42.081441", "step": 2027, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:42.117496", "step": 2027, "epoch": 3 }, { "type": "loss", "content": 0.011412362568080425, "timestamp": "2025-10-01 04:13:42.141905", "step": 2028, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:42.175179", "step": 2028, "epoch": 3 }, { "type": "loss", "content": 0.038777027279138565, "timestamp": "2025-10-01 04:13:42.177100", "step": 2029, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:42.209310", "step": 2029, "epoch": 3 }, { "type": "loss", "content": 0.01822769083082676, "timestamp": "2025-10-01 04:13:42.211295", "step": 2030, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:42.243929", "step": 2030, "epoch": 3 }, { "type": "loss", "content": 0.003951018210500479, "timestamp": "2025-10-01 04:13:42.246866", "step": 2031, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:42.277135", "step": 2031, "epoch": 3 }, { "type": "loss", "content": 0.010680814273655415, "timestamp": "2025-10-01 04:13:42.300961", "step": 2032, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:42.331895", "step": 2032, "epoch": 3 }, { "type": "loss", "content": 0.012020791880786419, "timestamp": "2025-10-01 04:13:42.334238", "step": 2033, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:42.364424", "step": 2033, "epoch": 3 }, { "type": "loss", "content": 0.014839199371635914, "timestamp": "2025-10-01 04:13:42.368089", "step": 2034, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:42.400671", "step": 2034, "epoch": 3 }, { "type": "loss", "content": 0.004194910638034344, "timestamp": "2025-10-01 04:13:42.403322", "step": 2035, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:42.433814", "step": 2035, "epoch": 3 }, { "type": "loss", "content": 0.020307643339037895, "timestamp": "2025-10-01 04:13:42.457841", "step": 2036, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:42.488434", "step": 2036, "epoch": 3 }, { "type": "loss", "content": 0.012298102490603924, "timestamp": "2025-10-01 04:13:42.490738", "step": 2037, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:42.520715", "step": 2037, "epoch": 3 }, { "type": "loss", "content": 0.023398157209157944, "timestamp": "2025-10-01 04:13:42.523148", "step": 2038, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:42.554800", "step": 2038, "epoch": 3 }, { "type": "loss", "content": 0.0030963234603405, "timestamp": "2025-10-01 04:13:42.557395", "step": 2039, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:42.587791", "step": 2039, "epoch": 3 }, { "type": "loss", "content": 0.0033669748809188604, "timestamp": "2025-10-01 04:13:42.611517", "step": 2040, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:42.641218", "step": 2040, "epoch": 3 }, { "type": "loss", "content": 0.010291491635143757, "timestamp": "2025-10-01 04:13:42.643439", "step": 2041, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:42.674332", "step": 2041, "epoch": 3 }, { "type": "loss", "content": 0.001421350403688848, "timestamp": "2025-10-01 04:13:42.676645", "step": 2042, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:42.706673", "step": 2042, "epoch": 3 }, { "type": "loss", "content": 0.010278047993779182, "timestamp": "2025-10-01 04:13:42.709041", "step": 2043, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:42.738388", "step": 2043, "epoch": 3 }, { "type": "loss", "content": 0.015349557623267174, "timestamp": "2025-10-01 04:13:42.762173", "step": 2044, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:42.792206", "step": 2044, "epoch": 3 }, { "type": "loss", "content": 0.003486798843368888, "timestamp": "2025-10-01 04:13:42.794905", "step": 2045, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:42.824916", "step": 2045, "epoch": 3 }, { "type": "loss", "content": 0.05607367306947708, "timestamp": "2025-10-01 04:13:42.827353", "step": 2046, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:42.857918", "step": 2046, "epoch": 3 }, { "type": "loss", "content": 0.011063175275921822, "timestamp": "2025-10-01 04:13:42.865769", "step": 2047, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:42.895703", "step": 2047, "epoch": 3 }, { "type": "loss", "content": 0.0091552147641778, "timestamp": "2025-10-01 04:13:42.919950", "step": 2048, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:42.949538", "step": 2048, "epoch": 3 }, { "type": "loss", "content": 0.002595100784674287, "timestamp": "2025-10-01 04:13:42.951759", "step": 2049, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:42.981956", "step": 2049, "epoch": 3 }, { "type": "loss", "content": 0.009335712529718876, "timestamp": "2025-10-01 04:13:42.984094", "step": 2050, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:43.015008", "step": 2050, "epoch": 3 }, { "type": "loss", "content": 0.0054068006575107574, "timestamp": "2025-10-01 04:13:43.017493", "step": 2051, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:43.048016", "step": 2051, "epoch": 3 }, { "type": "loss", "content": 0.037687864154577255, "timestamp": "2025-10-01 04:13:43.072541", "step": 2052, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:13:43.824169", "step": 2052, "epoch": 3 }, { "type": "pplx", "content": 49795955.38740884, "timestamp": "2025-10-01 04:13:43.825908", "step": 2052, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:43.854213", "step": 2052, "epoch": 3 }, { "type": "loss", "content": 0.003781549632549286, "timestamp": "2025-10-01 04:13:43.856335", "step": 2053, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:43.886600", "step": 2053, "epoch": 3 }, { "type": "loss", "content": 0.006803255993872881, "timestamp": "2025-10-01 04:13:43.888530", "step": 2054, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:43.918473", "step": 2054, "epoch": 3 }, { "type": "loss", "content": 0.009120832197368145, "timestamp": "2025-10-01 04:13:43.921096", "step": 2055, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:43.950353", "step": 2055, "epoch": 3 }, { "type": "loss", "content": 0.002633993746712804, "timestamp": "2025-10-01 04:13:43.974505", "step": 2056, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:44.004236", "step": 2056, "epoch": 3 }, { "type": "loss", "content": 0.016486208885908127, "timestamp": "2025-10-01 04:13:44.006194", "step": 2057, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:44.036241", "step": 2057, "epoch": 3 }, { "type": "loss", "content": 0.005618637893348932, "timestamp": "2025-10-01 04:13:44.038098", "step": 2058, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:44.068300", "step": 2058, "epoch": 3 }, { "type": "loss", "content": 0.007397721987217665, "timestamp": "2025-10-01 04:13:44.070489", "step": 2059, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:44.100564", "step": 2059, "epoch": 3 }, { "type": "loss", "content": 0.005749840755015612, "timestamp": "2025-10-01 04:13:44.124490", "step": 2060, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:44.156311", "step": 2060, "epoch": 3 }, { "type": "loss", "content": 0.0069750151596963406, "timestamp": "2025-10-01 04:13:44.159010", "step": 2061, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:44.190939", "step": 2061, "epoch": 3 }, { "type": "loss", "content": 0.001430398435331881, "timestamp": "2025-10-01 04:13:44.193187", "step": 2062, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:44.223096", "step": 2062, "epoch": 3 }, { "type": "loss", "content": 0.017816442996263504, "timestamp": "2025-10-01 04:13:44.225709", "step": 2063, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:44.257661", "step": 2063, "epoch": 3 }, { "type": "loss", "content": 0.010645156726241112, "timestamp": "2025-10-01 04:13:44.281236", "step": 2064, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:44.312333", "step": 2064, "epoch": 3 }, { "type": "loss", "content": 0.001670172088779509, "timestamp": "2025-10-01 04:13:44.314198", "step": 2065, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:44.343736", "step": 2065, "epoch": 3 }, { "type": "loss", "content": 0.005457701627165079, "timestamp": "2025-10-01 04:13:44.346086", "step": 2066, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:44.378394", "step": 2066, "epoch": 3 }, { "type": "loss", "content": 0.01062505878508091, "timestamp": "2025-10-01 04:13:44.380558", "step": 2067, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:44.411292", "step": 2067, "epoch": 3 }, { "type": "loss", "content": 0.026712706312537193, "timestamp": "2025-10-01 04:13:44.435464", "step": 2068, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:44.466555", "step": 2068, "epoch": 3 }, { "type": "loss", "content": 0.0025216001085937023, "timestamp": "2025-10-01 04:13:44.468527", "step": 2069, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:44.498443", "step": 2069, "epoch": 3 }, { "type": "loss", "content": 0.01525406539440155, "timestamp": "2025-10-01 04:13:44.500631", "step": 2070, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:44.530696", "step": 2070, "epoch": 3 }, { "type": "loss", "content": 0.0010936484904959798, "timestamp": "2025-10-01 04:13:44.532965", "step": 2071, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:44.563112", "step": 2071, "epoch": 3 }, { "type": "loss", "content": 0.031001046299934387, "timestamp": "2025-10-01 04:13:44.587137", "step": 2072, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:44.618608", "step": 2072, "epoch": 3 }, { "type": "loss", "content": 0.009324406273663044, "timestamp": "2025-10-01 04:13:44.621026", "step": 2073, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:44.650867", "step": 2073, "epoch": 3 }, { "type": "loss", "content": 0.00281874043866992, "timestamp": "2025-10-01 04:13:44.653183", "step": 2074, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:44.685662", "step": 2074, "epoch": 3 }, { "type": "loss", "content": 0.0038216691464185715, "timestamp": "2025-10-01 04:13:44.688020", "step": 2075, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:44.721222", "step": 2075, "epoch": 3 }, { "type": "loss", "content": 0.0112691018730402, "timestamp": "2025-10-01 04:13:44.745417", "step": 2076, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:44.780566", "step": 2076, "epoch": 3 }, { "type": "loss", "content": 0.004401656799018383, "timestamp": "2025-10-01 04:13:44.782824", "step": 2077, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:44.814464", "step": 2077, "epoch": 3 }, { "type": "loss", "content": 0.015684420242905617, "timestamp": "2025-10-01 04:13:44.816835", "step": 2078, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:44.847840", "step": 2078, "epoch": 3 }, { "type": "loss", "content": 0.01610006019473076, "timestamp": "2025-10-01 04:13:44.850063", "step": 2079, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:44.880699", "step": 2079, "epoch": 3 }, { "type": "loss", "content": 0.00259937415830791, "timestamp": "2025-10-01 04:13:44.904664", "step": 2080, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:44.934179", "step": 2080, "epoch": 3 }, { "type": "loss", "content": 0.006803158205002546, "timestamp": "2025-10-01 04:13:44.936653", "step": 2081, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:44.966449", "step": 2081, "epoch": 3 }, { "type": "loss", "content": 0.012734351679682732, "timestamp": "2025-10-01 04:13:44.969008", "step": 2082, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:44.999155", "step": 2082, "epoch": 3 }, { "type": "loss", "content": 0.005645200610160828, "timestamp": "2025-10-01 04:13:45.002756", "step": 2083, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:45.032700", "step": 2083, "epoch": 3 }, { "type": "loss", "content": 0.002019307343289256, "timestamp": "2025-10-01 04:13:45.056534", "step": 2084, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:45.086792", "step": 2084, "epoch": 3 }, { "type": "loss", "content": 0.04551689699292183, "timestamp": "2025-10-01 04:13:45.089082", "step": 2085, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:45.124508", "step": 2085, "epoch": 3 }, { "type": "loss", "content": 0.012857954017817974, "timestamp": "2025-10-01 04:13:45.126591", "step": 2086, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:45.156515", "step": 2086, "epoch": 3 }, { "type": "loss", "content": 0.006605700124055147, "timestamp": "2025-10-01 04:13:45.159999", "step": 2087, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:45.190392", "step": 2087, "epoch": 3 }, { "type": "loss", "content": 0.008717053569853306, "timestamp": "2025-10-01 04:13:45.214408", "step": 2088, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:45.244771", "step": 2088, "epoch": 3 }, { "type": "loss", "content": 0.0019231383921578526, "timestamp": "2025-10-01 04:13:45.247092", "step": 2089, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:13:45.278038", "step": 2089, "epoch": 3 }, { "type": "loss", "content": 0.0013605657732114196, "timestamp": "2025-10-01 04:13:45.281072", "step": 2090, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:45.310793", "step": 2090, "epoch": 3 }, { "type": "loss", "content": 0.0019781345035880804, "timestamp": "2025-10-01 04:13:45.313165", "step": 2091, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:45.343456", "step": 2091, "epoch": 3 }, { "type": "loss", "content": 0.011783729307353497, "timestamp": "2025-10-01 04:13:45.367477", "step": 2092, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:45.396984", "step": 2092, "epoch": 3 }, { "type": "loss", "content": 0.0034526768140494823, "timestamp": "2025-10-01 04:13:45.401775", "step": 2093, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:45.433859", "step": 2093, "epoch": 3 }, { "type": "loss", "content": 0.012783399783074856, "timestamp": "2025-10-01 04:13:45.436583", "step": 2094, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:45.467633", "step": 2094, "epoch": 3 }, { "type": "loss", "content": 0.003527685534209013, "timestamp": "2025-10-01 04:13:45.470766", "step": 2095, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:45.503424", "step": 2095, "epoch": 3 }, { "type": "loss", "content": 0.033071406185626984, "timestamp": "2025-10-01 04:13:45.527184", "step": 2096, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:45.559430", "step": 2096, "epoch": 3 }, { "type": "loss", "content": 0.0020220919977873564, "timestamp": "2025-10-01 04:13:45.562264", "step": 2097, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:45.595810", "step": 2097, "epoch": 3 }, { "type": "loss", "content": 0.021835412830114365, "timestamp": "2025-10-01 04:13:45.598911", "step": 2098, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:45.630928", "step": 2098, "epoch": 3 }, { "type": "loss", "content": 0.014858120121061802, "timestamp": "2025-10-01 04:13:45.634070", "step": 2099, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:45.666235", "step": 2099, "epoch": 3 }, { "type": "loss", "content": 0.006137244403362274, "timestamp": "2025-10-01 04:13:45.690758", "step": 2100, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:45.721698", "step": 2100, "epoch": 3 }, { "type": "loss", "content": 0.001460201689042151, "timestamp": "2025-10-01 04:13:45.724550", "step": 2101, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:45.756040", "step": 2101, "epoch": 3 }, { "type": "loss", "content": 0.0022680035326629877, "timestamp": "2025-10-01 04:13:45.758784", "step": 2102, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:45.791364", "step": 2102, "epoch": 3 }, { "type": "loss", "content": 0.015940619632601738, "timestamp": "2025-10-01 04:13:45.794323", "step": 2103, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:45.826211", "step": 2103, "epoch": 3 }, { "type": "loss", "content": 0.001429266412742436, "timestamp": "2025-10-01 04:13:45.850814", "step": 2104, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:45.883253", "step": 2104, "epoch": 3 }, { "type": "loss", "content": 0.0032171185594052076, "timestamp": "2025-10-01 04:13:45.886173", "step": 2105, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:45.918341", "step": 2105, "epoch": 3 }, { "type": "loss", "content": 0.004030963871628046, "timestamp": "2025-10-01 04:13:45.921093", "step": 2106, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:45.954527", "step": 2106, "epoch": 3 }, { "type": "loss", "content": 0.005968966521322727, "timestamp": "2025-10-01 04:13:45.957054", "step": 2107, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:45.991918", "step": 2107, "epoch": 3 }, { "type": "loss", "content": 0.009835876524448395, "timestamp": "2025-10-01 04:13:46.015902", "step": 2108, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:46.049093", "step": 2108, "epoch": 3 }, { "type": "loss", "content": 0.008758926764130592, "timestamp": "2025-10-01 04:13:46.051296", "step": 2109, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:13:46.863488", "step": 2109, "epoch": 3 }, { "type": "pplx", "content": 48746742.06245831, "timestamp": "2025-10-01 04:13:46.866008", "step": 2109, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:46.896272", "step": 2109, "epoch": 3 }, { "type": "loss", "content": 0.0032065026462078094, "timestamp": "2025-10-01 04:13:46.899890", "step": 2110, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:46.933939", "step": 2110, "epoch": 3 }, { "type": "loss", "content": 0.00044523560791276395, "timestamp": "2025-10-01 04:13:46.937038", "step": 2111, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:46.968805", "step": 2111, "epoch": 3 }, { "type": "loss", "content": 0.01161577831953764, "timestamp": "2025-10-01 04:13:46.993753", "step": 2112, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:47.025918", "step": 2112, "epoch": 3 }, { "type": "loss", "content": 0.004290744196623564, "timestamp": "2025-10-01 04:13:47.028884", "step": 2113, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:47.062758", "step": 2113, "epoch": 3 }, { "type": "loss", "content": 0.022032011300325394, "timestamp": "2025-10-01 04:13:47.065644", "step": 2114, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:47.097294", "step": 2114, "epoch": 3 }, { "type": "loss", "content": 0.0017666907515376806, "timestamp": "2025-10-01 04:13:47.100008", "step": 2115, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:47.131075", "step": 2115, "epoch": 3 }, { "type": "loss", "content": 0.003643403295427561, "timestamp": "2025-10-01 04:13:47.155465", "step": 2116, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:47.187837", "step": 2116, "epoch": 3 }, { "type": "loss", "content": 0.003369935555383563, "timestamp": "2025-10-01 04:13:47.190500", "step": 2117, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:47.222947", "step": 2117, "epoch": 3 }, { "type": "loss", "content": 0.03525031730532646, "timestamp": "2025-10-01 04:13:47.226235", "step": 2118, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:47.257844", "step": 2118, "epoch": 3 }, { "type": "loss", "content": 0.0019307138863950968, "timestamp": "2025-10-01 04:13:47.262778", "step": 2119, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:47.294941", "step": 2119, "epoch": 3 }, { "type": "loss", "content": 0.0005271370173431933, "timestamp": "2025-10-01 04:13:47.319366", "step": 2120, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:13:47.350854", "step": 2120, "epoch": 3 }, { "type": "loss", "content": 0.0013889552792534232, "timestamp": "2025-10-01 04:13:47.354098", "step": 2121, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:47.388632", "step": 2121, "epoch": 3 }, { "type": "loss", "content": 0.03613368794322014, "timestamp": "2025-10-01 04:13:47.391371", "step": 2122, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:47.422908", "step": 2122, "epoch": 3 }, { "type": "loss", "content": 0.02574986405670643, "timestamp": "2025-10-01 04:13:47.424925", "step": 2123, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:47.454375", "step": 2123, "epoch": 3 }, { "type": "loss", "content": 0.0014076424995437264, "timestamp": "2025-10-01 04:13:47.478253", "step": 2124, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:47.508211", "step": 2124, "epoch": 3 }, { "type": "loss", "content": 0.005204681307077408, "timestamp": "2025-10-01 04:13:47.510895", "step": 2125, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:47.541858", "step": 2125, "epoch": 3 }, { "type": "loss", "content": 0.005410588346421719, "timestamp": "2025-10-01 04:13:47.544363", "step": 2126, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:47.574928", "step": 2126, "epoch": 3 }, { "type": "loss", "content": 0.003872995963320136, "timestamp": "2025-10-01 04:13:47.577864", "step": 2127, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:47.607857", "step": 2127, "epoch": 3 }, { "type": "loss", "content": 0.016898931935429573, "timestamp": "2025-10-01 04:13:47.631574", "step": 2128, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:47.661728", "step": 2128, "epoch": 3 }, { "type": "loss", "content": 0.009217137470841408, "timestamp": "2025-10-01 04:13:47.664119", "step": 2129, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:47.694288", "step": 2129, "epoch": 3 }, { "type": "loss", "content": 0.002619354287162423, "timestamp": "2025-10-01 04:13:47.697060", "step": 2130, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:47.726556", "step": 2130, "epoch": 3 }, { "type": "loss", "content": 0.0029307217337191105, "timestamp": "2025-10-01 04:13:47.728790", "step": 2131, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:47.758161", "step": 2131, "epoch": 3 }, { "type": "loss", "content": 0.008102916181087494, "timestamp": "2025-10-01 04:13:47.783415", "step": 2132, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:47.819274", "step": 2132, "epoch": 3 }, { "type": "loss", "content": 0.02566385827958584, "timestamp": "2025-10-01 04:13:47.822776", "step": 2133, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:47.852589", "step": 2133, "epoch": 3 }, { "type": "loss", "content": 0.001259711803868413, "timestamp": "2025-10-01 04:13:47.854906", "step": 2134, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:47.885246", "step": 2134, "epoch": 3 }, { "type": "loss", "content": 0.0018119417363777757, "timestamp": "2025-10-01 04:13:47.887927", "step": 2135, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:47.920539", "step": 2135, "epoch": 3 }, { "type": "loss", "content": 0.0007317255367524922, "timestamp": "2025-10-01 04:13:47.944256", "step": 2136, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:47.975686", "step": 2136, "epoch": 3 }, { "type": "loss", "content": 0.003940492402762175, "timestamp": "2025-10-01 04:13:47.977981", "step": 2137, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:48.009006", "step": 2137, "epoch": 3 }, { "type": "loss", "content": 0.006948811002075672, "timestamp": "2025-10-01 04:13:48.011279", "step": 2138, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:48.042415", "step": 2138, "epoch": 3 }, { "type": "loss", "content": 0.0008612987585365772, "timestamp": "2025-10-01 04:13:48.044758", "step": 2139, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:48.075121", "step": 2139, "epoch": 3 }, { "type": "loss", "content": 0.002451820531859994, "timestamp": "2025-10-01 04:13:48.099113", "step": 2140, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:48.129880", "step": 2140, "epoch": 3 }, { "type": "loss", "content": 0.0021910052746534348, "timestamp": "2025-10-01 04:13:48.132202", "step": 2141, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:48.162599", "step": 2141, "epoch": 3 }, { "type": "loss", "content": 0.0032072842586785555, "timestamp": "2025-10-01 04:13:48.165034", "step": 2142, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:48.196290", "step": 2142, "epoch": 3 }, { "type": "loss", "content": 0.0010801416356116533, "timestamp": "2025-10-01 04:13:48.199115", "step": 2143, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:48.229594", "step": 2143, "epoch": 3 }, { "type": "loss", "content": 0.0031351468060165644, "timestamp": "2025-10-01 04:13:48.253294", "step": 2144, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:48.283374", "step": 2144, "epoch": 3 }, { "type": "loss", "content": 0.0016169316368177533, "timestamp": "2025-10-01 04:13:48.285652", "step": 2145, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:48.315534", "step": 2145, "epoch": 3 }, { "type": "loss", "content": 0.000727647275198251, "timestamp": "2025-10-01 04:13:48.317900", "step": 2146, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:48.349058", "step": 2146, "epoch": 3 }, { "type": "loss", "content": 0.010667803697288036, "timestamp": "2025-10-01 04:13:48.351754", "step": 2147, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:48.381734", "step": 2147, "epoch": 3 }, { "type": "loss", "content": 0.0024543162435293198, "timestamp": "2025-10-01 04:13:48.405543", "step": 2148, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:48.436106", "step": 2148, "epoch": 3 }, { "type": "loss", "content": 0.004013399593532085, "timestamp": "2025-10-01 04:13:48.438645", "step": 2149, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:48.468985", "step": 2149, "epoch": 3 }, { "type": "loss", "content": 0.00043969464604742825, "timestamp": "2025-10-01 04:13:48.471292", "step": 2150, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:48.501856", "step": 2150, "epoch": 3 }, { "type": "loss", "content": 0.012558751739561558, "timestamp": "2025-10-01 04:13:48.504957", "step": 2151, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:48.535528", "step": 2151, "epoch": 3 }, { "type": "loss", "content": 0.003782300977036357, "timestamp": "2025-10-01 04:13:48.560637", "step": 2152, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:48.591438", "step": 2152, "epoch": 3 }, { "type": "loss", "content": 0.0036086090840399265, "timestamp": "2025-10-01 04:13:48.593827", "step": 2153, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:48.623692", "step": 2153, "epoch": 3 }, { "type": "loss", "content": 0.0017416387563571334, "timestamp": "2025-10-01 04:13:48.626270", "step": 2154, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:48.657538", "step": 2154, "epoch": 3 }, { "type": "loss", "content": 0.002613009186461568, "timestamp": "2025-10-01 04:13:48.660086", "step": 2155, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:48.691023", "step": 2155, "epoch": 3 }, { "type": "loss", "content": 0.001628889818675816, "timestamp": "2025-10-01 04:13:48.714753", "step": 2156, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:48.744799", "step": 2156, "epoch": 3 }, { "type": "loss", "content": 0.004056987352669239, "timestamp": "2025-10-01 04:13:48.746853", "step": 2157, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:48.776634", "step": 2157, "epoch": 3 }, { "type": "loss", "content": 0.000895503384526819, "timestamp": "2025-10-01 04:13:48.779343", "step": 2158, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:48.810105", "step": 2158, "epoch": 3 }, { "type": "loss", "content": 0.003871302818879485, "timestamp": "2025-10-01 04:13:48.812770", "step": 2159, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:48.842931", "step": 2159, "epoch": 3 }, { "type": "loss", "content": 0.0022862900514155626, "timestamp": "2025-10-01 04:13:48.866729", "step": 2160, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:48.896942", "step": 2160, "epoch": 3 }, { "type": "loss", "content": 0.007540857885032892, "timestamp": "2025-10-01 04:13:48.899302", "step": 2161, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:13:48.930076", "step": 2161, "epoch": 3 }, { "type": "loss", "content": 0.02161705680191517, "timestamp": "2025-10-01 04:13:48.932998", "step": 2162, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:48.963775", "step": 2162, "epoch": 3 }, { "type": "loss", "content": 0.011639682576060295, "timestamp": "2025-10-01 04:13:48.966183", "step": 2163, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:48.996938", "step": 2163, "epoch": 3 }, { "type": "loss", "content": 0.005038486327975988, "timestamp": "2025-10-01 04:13:49.020652", "step": 2164, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:49.052583", "step": 2164, "epoch": 3 }, { "type": "loss", "content": 0.0005112849758006632, "timestamp": "2025-10-01 04:13:49.054967", "step": 2165, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:49.085636", "step": 2165, "epoch": 3 }, { "type": "loss", "content": 0.0019404730992391706, "timestamp": "2025-10-01 04:13:49.088156", "step": 2166, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:13:49.839197", "step": 2166, "epoch": 3 }, { "type": "pplx", "content": 52580040.12786619, "timestamp": "2025-10-01 04:13:49.841341", "step": 2166, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:49.871723", "step": 2166, "epoch": 3 }, { "type": "loss", "content": 0.021567419171333313, "timestamp": "2025-10-01 04:13:49.874363", "step": 2167, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:49.905190", "step": 2167, "epoch": 3 }, { "type": "loss", "content": 0.0038097966462373734, "timestamp": "2025-10-01 04:13:49.929270", "step": 2168, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:49.959826", "step": 2168, "epoch": 3 }, { "type": "loss", "content": 0.001439273008145392, "timestamp": "2025-10-01 04:13:49.962113", "step": 2169, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:13:49.993800", "step": 2169, "epoch": 3 }, { "type": "loss", "content": 0.0005533623043447733, "timestamp": "2025-10-01 04:13:49.996628", "step": 2170, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:50.028080", "step": 2170, "epoch": 3 }, { "type": "loss", "content": 0.0012496901908889413, "timestamp": "2025-10-01 04:13:50.030672", "step": 2171, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:50.061894", "step": 2171, "epoch": 3 }, { "type": "loss", "content": 0.0007430757395923138, "timestamp": "2025-10-01 04:13:50.090651", "step": 2172, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:50.130287", "step": 2172, "epoch": 3 }, { "type": "loss", "content": 0.0024324983824044466, "timestamp": "2025-10-01 04:13:50.132440", "step": 2173, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:50.162682", "step": 2173, "epoch": 3 }, { "type": "loss", "content": 0.015526972711086273, "timestamp": "2025-10-01 04:13:50.165032", "step": 2174, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:50.196207", "step": 2174, "epoch": 3 }, { "type": "loss", "content": 0.003723499597981572, "timestamp": "2025-10-01 04:13:50.200706", "step": 2175, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:50.232248", "step": 2175, "epoch": 3 }, { "type": "loss", "content": 0.0009116663713939488, "timestamp": "2025-10-01 04:13:50.256033", "step": 2176, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:50.287948", "step": 2176, "epoch": 3 }, { "type": "loss", "content": 0.0016930032288655639, "timestamp": "2025-10-01 04:13:50.290258", "step": 2177, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:50.319935", "step": 2177, "epoch": 3 }, { "type": "loss", "content": 0.07970846444368362, "timestamp": "2025-10-01 04:13:50.322171", "step": 2178, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:50.352874", "step": 2178, "epoch": 3 }, { "type": "loss", "content": 0.0005008368170820177, "timestamp": "2025-10-01 04:13:50.355091", "step": 2179, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:50.385190", "step": 2179, "epoch": 3 }, { "type": "loss", "content": 0.0017755134031176567, "timestamp": "2025-10-01 04:13:50.409568", "step": 2180, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:50.439865", "step": 2180, "epoch": 3 }, { "type": "loss", "content": 0.0006339551182463765, "timestamp": "2025-10-01 04:13:50.442350", "step": 2181, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:50.472507", "step": 2181, "epoch": 3 }, { "type": "loss", "content": 0.0005257153534330428, "timestamp": "2025-10-01 04:13:50.475103", "step": 2182, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:50.505242", "step": 2182, "epoch": 3 }, { "type": "loss", "content": 0.0016372958198189735, "timestamp": "2025-10-01 04:13:50.507994", "step": 2183, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:50.538273", "step": 2183, "epoch": 3 }, { "type": "loss", "content": 0.0017383432714268565, "timestamp": "2025-10-01 04:13:50.561965", "step": 2184, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:50.592857", "step": 2184, "epoch": 3 }, { "type": "loss", "content": 0.01293744146823883, "timestamp": "2025-10-01 04:13:50.595401", "step": 2185, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:50.625952", "step": 2185, "epoch": 3 }, { "type": "loss", "content": 0.0013602704275399446, "timestamp": "2025-10-01 04:13:50.628065", "step": 2186, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:50.660109", "step": 2186, "epoch": 3 }, { "type": "loss", "content": 0.029963940382003784, "timestamp": "2025-10-01 04:13:50.662355", "step": 2187, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:50.692996", "step": 2187, "epoch": 3 }, { "type": "loss", "content": 0.001546170562505722, "timestamp": "2025-10-01 04:13:50.716697", "step": 2188, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:50.746866", "step": 2188, "epoch": 3 }, { "type": "loss", "content": 0.0025659650564193726, "timestamp": "2025-10-01 04:13:50.749083", "step": 2189, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:50.779294", "step": 2189, "epoch": 3 }, { "type": "loss", "content": 0.0019798476714640856, "timestamp": "2025-10-01 04:13:50.781579", "step": 2190, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:50.816634", "step": 2190, "epoch": 3 }, { "type": "loss", "content": 0.021714312955737114, "timestamp": "2025-10-01 04:13:50.819334", "step": 2191, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:50.849601", "step": 2191, "epoch": 3 }, { "type": "loss", "content": 0.0006570308469235897, "timestamp": "2025-10-01 04:13:50.873553", "step": 2192, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:50.903563", "step": 2192, "epoch": 3 }, { "type": "loss", "content": 0.0003911609819624573, "timestamp": "2025-10-01 04:13:50.906388", "step": 2193, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:50.936361", "step": 2193, "epoch": 3 }, { "type": "loss", "content": 0.012683354318141937, "timestamp": "2025-10-01 04:13:50.938846", "step": 2194, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:50.969034", "step": 2194, "epoch": 3 }, { "type": "loss", "content": 0.0007620817050337791, "timestamp": "2025-10-01 04:13:50.971696", "step": 2195, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:51.002373", "step": 2195, "epoch": 3 }, { "type": "loss", "content": 0.04474819079041481, "timestamp": "2025-10-01 04:13:51.025900", "step": 2196, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:51.056712", "step": 2196, "epoch": 3 }, { "type": "loss", "content": 0.02375408448278904, "timestamp": "2025-10-01 04:13:51.058967", "step": 2197, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:51.089240", "step": 2197, "epoch": 3 }, { "type": "loss", "content": 0.003066360717639327, "timestamp": "2025-10-01 04:13:51.091440", "step": 2198, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:51.121521", "step": 2198, "epoch": 3 }, { "type": "loss", "content": 0.0017424180405214429, "timestamp": "2025-10-01 04:13:51.124399", "step": 2199, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:51.155109", "step": 2199, "epoch": 3 }, { "type": "loss", "content": 0.0005147810443304479, "timestamp": "2025-10-01 04:13:51.178823", "step": 2200, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:13:51.209488", "step": 2200, "epoch": 3 }, { "type": "loss", "content": 0.0011206363560631871, "timestamp": "2025-10-01 04:13:51.214386", "step": 2201, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:51.245040", "step": 2201, "epoch": 3 }, { "type": "loss", "content": 0.014846226200461388, "timestamp": "2025-10-01 04:13:51.247451", "step": 2202, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:51.277363", "step": 2202, "epoch": 3 }, { "type": "loss", "content": 0.0005189668736420572, "timestamp": "2025-10-01 04:13:51.280000", "step": 2203, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:51.310539", "step": 2203, "epoch": 3 }, { "type": "loss", "content": 0.0006393748917616904, "timestamp": "2025-10-01 04:13:51.334449", "step": 2204, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:51.366533", "step": 2204, "epoch": 3 }, { "type": "loss", "content": 0.001081749564036727, "timestamp": "2025-10-01 04:13:51.368875", "step": 2205, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:51.398790", "step": 2205, "epoch": 3 }, { "type": "loss", "content": 0.0028890289831906557, "timestamp": "2025-10-01 04:13:51.401159", "step": 2206, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:51.432261", "step": 2206, "epoch": 3 }, { "type": "loss", "content": 0.0013795385602861643, "timestamp": "2025-10-01 04:13:51.434848", "step": 2207, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:51.465835", "step": 2207, "epoch": 3 }, { "type": "loss", "content": 0.006973086390644312, "timestamp": "2025-10-01 04:13:51.490177", "step": 2208, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:51.521034", "step": 2208, "epoch": 3 }, { "type": "loss", "content": 0.0016343960305675864, "timestamp": "2025-10-01 04:13:51.523305", "step": 2209, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:51.554179", "step": 2209, "epoch": 3 }, { "type": "loss", "content": 0.0010523818200454116, "timestamp": "2025-10-01 04:13:51.557872", "step": 2210, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:51.592688", "step": 2210, "epoch": 3 }, { "type": "loss", "content": 0.0007708879420533776, "timestamp": "2025-10-01 04:13:51.594986", "step": 2211, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:51.624798", "step": 2211, "epoch": 3 }, { "type": "loss", "content": 0.001242365688085556, "timestamp": "2025-10-01 04:13:51.648776", "step": 2212, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:51.683344", "step": 2212, "epoch": 3 }, { "type": "loss", "content": 0.009481081739068031, "timestamp": "2025-10-01 04:13:51.686810", "step": 2213, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:51.718562", "step": 2213, "epoch": 3 }, { "type": "loss", "content": 0.0021511695813387632, "timestamp": "2025-10-01 04:13:51.721237", "step": 2214, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:51.751752", "step": 2214, "epoch": 3 }, { "type": "loss", "content": 0.002733413130044937, "timestamp": "2025-10-01 04:13:51.753880", "step": 2215, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:51.783955", "step": 2215, "epoch": 3 }, { "type": "loss", "content": 0.017588740214705467, "timestamp": "2025-10-01 04:13:51.808100", "step": 2216, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:51.838417", "step": 2216, "epoch": 3 }, { "type": "loss", "content": 0.0065732793882489204, "timestamp": "2025-10-01 04:13:51.841084", "step": 2217, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:51.872494", "step": 2217, "epoch": 3 }, { "type": "loss", "content": 0.016586460173130035, "timestamp": "2025-10-01 04:13:51.875321", "step": 2218, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:51.912047", "step": 2218, "epoch": 3 }, { "type": "loss", "content": 0.0012413633521646261, "timestamp": "2025-10-01 04:13:51.914928", "step": 2219, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:51.945079", "step": 2219, "epoch": 3 }, { "type": "loss", "content": 0.0022606845013797283, "timestamp": "2025-10-01 04:13:51.970122", "step": 2220, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:52.000245", "step": 2220, "epoch": 3 }, { "type": "loss", "content": 0.000476359884487465, "timestamp": "2025-10-01 04:13:52.002609", "step": 2221, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:52.032633", "step": 2221, "epoch": 3 }, { "type": "loss", "content": 0.0018863745499402285, "timestamp": "2025-10-01 04:13:52.034902", "step": 2222, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:52.065407", "step": 2222, "epoch": 3 }, { "type": "loss", "content": 0.0005490041221491992, "timestamp": "2025-10-01 04:13:52.068087", "step": 2223, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:13:52.878486", "step": 2223, "epoch": 3 }, { "type": "pplx", "content": 57872429.750115015, "timestamp": "2025-10-01 04:13:52.883516", "step": 2223, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:52.915823", "step": 2223, "epoch": 3 }, { "type": "loss", "content": 0.005303422920405865, "timestamp": "2025-10-01 04:13:52.939665", "step": 2224, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:52.970698", "step": 2224, "epoch": 3 }, { "type": "loss", "content": 0.001239862642250955, "timestamp": "2025-10-01 04:13:52.974647", "step": 2225, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:53.004930", "step": 2225, "epoch": 3 }, { "type": "loss", "content": 0.0014868489233776927, "timestamp": "2025-10-01 04:13:53.007349", "step": 2226, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:53.039270", "step": 2226, "epoch": 3 }, { "type": "loss", "content": 0.014227977022528648, "timestamp": "2025-10-01 04:13:53.041520", "step": 2227, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:13:53.072837", "step": 2227, "epoch": 3 }, { "type": "loss", "content": 0.01482341904193163, "timestamp": "2025-10-01 04:13:53.096873", "step": 2228, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:53.127041", "step": 2228, "epoch": 3 }, { "type": "loss", "content": 0.00040583201916888356, "timestamp": "2025-10-01 04:13:53.129207", "step": 2229, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:53.159992", "step": 2229, "epoch": 3 }, { "type": "loss", "content": 0.001420902437530458, "timestamp": "2025-10-01 04:13:53.163552", "step": 2230, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:53.196656", "step": 2230, "epoch": 3 }, { "type": "loss", "content": 0.00013874395517632365, "timestamp": "2025-10-01 04:13:53.198927", "step": 2231, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:53.230201", "step": 2231, "epoch": 3 }, { "type": "loss", "content": 0.018192876130342484, "timestamp": "2025-10-01 04:13:53.254018", "step": 2232, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:53.284629", "step": 2232, "epoch": 3 }, { "type": "loss", "content": 0.0024055861867964268, "timestamp": "2025-10-01 04:13:53.286870", "step": 2233, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:53.319096", "step": 2233, "epoch": 3 }, { "type": "loss", "content": 0.0012060150038450956, "timestamp": "2025-10-01 04:13:53.321182", "step": 2234, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:53.351973", "step": 2234, "epoch": 3 }, { "type": "loss", "content": 0.001272205961868167, "timestamp": "2025-10-01 04:13:53.354100", "step": 2235, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:53.385556", "step": 2235, "epoch": 3 }, { "type": "loss", "content": 0.0032269915100187063, "timestamp": "2025-10-01 04:13:53.412538", "step": 2236, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:53.443382", "step": 2236, "epoch": 3 }, { "type": "loss", "content": 0.001299424795433879, "timestamp": "2025-10-01 04:13:53.445926", "step": 2237, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:53.477086", "step": 2237, "epoch": 3 }, { "type": "loss", "content": 0.0036020518746227026, "timestamp": "2025-10-01 04:13:53.479343", "step": 2238, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:53.511060", "step": 2238, "epoch": 3 }, { "type": "loss", "content": 0.027935948222875595, "timestamp": "2025-10-01 04:13:53.513294", "step": 2239, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:53.544020", "step": 2239, "epoch": 3 }, { "type": "loss", "content": 0.003648341167718172, "timestamp": "2025-10-01 04:13:53.567927", "step": 2240, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:53.599405", "step": 2240, "epoch": 3 }, { "type": "loss", "content": 0.0017400241922587156, "timestamp": "2025-10-01 04:13:53.601898", "step": 2241, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:53.633111", "step": 2241, "epoch": 3 }, { "type": "loss", "content": 0.00026442264788784087, "timestamp": "2025-10-01 04:13:53.635470", "step": 2242, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:53.666920", "step": 2242, "epoch": 3 }, { "type": "loss", "content": 0.0010703576263040304, "timestamp": "2025-10-01 04:13:53.669857", "step": 2243, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:53.700975", "step": 2243, "epoch": 3 }, { "type": "loss", "content": 0.0018084843177348375, "timestamp": "2025-10-01 04:13:53.724547", "step": 2244, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:53.754331", "step": 2244, "epoch": 3 }, { "type": "loss", "content": 0.004137086682021618, "timestamp": "2025-10-01 04:13:53.756585", "step": 2245, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:53.787015", "step": 2245, "epoch": 3 }, { "type": "loss", "content": 0.004625528585165739, "timestamp": "2025-10-01 04:13:53.789622", "step": 2246, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:53.819494", "step": 2246, "epoch": 3 }, { "type": "loss", "content": 0.002892305376008153, "timestamp": "2025-10-01 04:13:53.821935", "step": 2247, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:53.852186", "step": 2247, "epoch": 3 }, { "type": "loss", "content": 0.010060891509056091, "timestamp": "2025-10-01 04:13:53.876285", "step": 2248, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:53.908538", "step": 2248, "epoch": 3 }, { "type": "loss", "content": 0.010718696750700474, "timestamp": "2025-10-01 04:13:53.911889", "step": 2249, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:53.943796", "step": 2249, "epoch": 3 }, { "type": "loss", "content": 0.0001337795110885054, "timestamp": "2025-10-01 04:13:53.946157", "step": 2250, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:53.976787", "step": 2250, "epoch": 3 }, { "type": "loss", "content": 0.004378857556730509, "timestamp": "2025-10-01 04:13:53.979403", "step": 2251, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:54.009828", "step": 2251, "epoch": 3 }, { "type": "loss", "content": 0.005483386572450399, "timestamp": "2025-10-01 04:13:54.033818", "step": 2252, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:54.065109", "step": 2252, "epoch": 3 }, { "type": "loss", "content": 0.0017293499549850821, "timestamp": "2025-10-01 04:13:54.067304", "step": 2253, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:54.100189", "step": 2253, "epoch": 3 }, { "type": "loss", "content": 0.0008020902168937027, "timestamp": "2025-10-01 04:13:54.102844", "step": 2254, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:54.133014", "step": 2254, "epoch": 3 }, { "type": "loss", "content": 0.006860501132905483, "timestamp": "2025-10-01 04:13:54.135596", "step": 2255, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:54.165944", "step": 2255, "epoch": 3 }, { "type": "loss", "content": 0.00011491885379655287, "timestamp": "2025-10-01 04:13:54.189484", "step": 2256, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:54.220908", "step": 2256, "epoch": 3 }, { "type": "loss", "content": 0.0001829215616453439, "timestamp": "2025-10-01 04:13:54.223228", "step": 2257, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:54.255147", "step": 2257, "epoch": 3 }, { "type": "loss", "content": 0.006004713010042906, "timestamp": "2025-10-01 04:13:54.257397", "step": 2258, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:54.288126", "step": 2258, "epoch": 3 }, { "type": "loss", "content": 0.0025731585919857025, "timestamp": "2025-10-01 04:13:54.290963", "step": 2259, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:54.321128", "step": 2259, "epoch": 3 }, { "type": "loss", "content": 0.005405474919825792, "timestamp": "2025-10-01 04:13:54.344849", "step": 2260, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:54.376244", "step": 2260, "epoch": 3 }, { "type": "loss", "content": 0.001444359077140689, "timestamp": "2025-10-01 04:13:54.378603", "step": 2261, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:54.408929", "step": 2261, "epoch": 3 }, { "type": "loss", "content": 0.0011323849903419614, "timestamp": "2025-10-01 04:13:54.411072", "step": 2262, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:54.440934", "step": 2262, "epoch": 3 }, { "type": "loss", "content": 0.004098639357835054, "timestamp": "2025-10-01 04:13:54.443178", "step": 2263, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:54.474479", "step": 2263, "epoch": 3 }, { "type": "loss", "content": 0.0014724934007972479, "timestamp": "2025-10-01 04:13:54.498620", "step": 2264, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:54.528900", "step": 2264, "epoch": 3 }, { "type": "loss", "content": 0.00010470008419360965, "timestamp": "2025-10-01 04:13:54.531177", "step": 2265, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:54.561929", "step": 2265, "epoch": 3 }, { "type": "loss", "content": 0.0005662904004566371, "timestamp": "2025-10-01 04:13:54.564673", "step": 2266, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:54.595382", "step": 2266, "epoch": 3 }, { "type": "loss", "content": 0.02000434696674347, "timestamp": "2025-10-01 04:13:54.597915", "step": 2267, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:13:54.628014", "step": 2267, "epoch": 3 }, { "type": "loss", "content": 0.003685658099129796, "timestamp": "2025-10-01 04:13:54.651964", "step": 2268, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:54.681973", "step": 2268, "epoch": 3 }, { "type": "loss", "content": 0.00028314744122326374, "timestamp": "2025-10-01 04:13:54.684005", "step": 2269, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:54.715355", "step": 2269, "epoch": 3 }, { "type": "loss", "content": 0.0002110866189468652, "timestamp": "2025-10-01 04:13:54.717723", "step": 2270, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:54.748149", "step": 2270, "epoch": 3 }, { "type": "loss", "content": 0.000801309070084244, "timestamp": "2025-10-01 04:13:54.750891", "step": 2271, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:54.781274", "step": 2271, "epoch": 3 }, { "type": "loss", "content": 0.0008623714675195515, "timestamp": "2025-10-01 04:13:54.805287", "step": 2272, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:54.837178", "step": 2272, "epoch": 3 }, { "type": "loss", "content": 0.00174813368357718, "timestamp": "2025-10-01 04:13:54.839371", "step": 2273, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:54.869678", "step": 2273, "epoch": 3 }, { "type": "loss", "content": 0.001442483626306057, "timestamp": "2025-10-01 04:13:54.872114", "step": 2274, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:54.902878", "step": 2274, "epoch": 3 }, { "type": "loss", "content": 0.0007527487468905747, "timestamp": "2025-10-01 04:13:54.905230", "step": 2275, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:54.936627", "step": 2275, "epoch": 3 }, { "type": "loss", "content": 0.0018074663821607828, "timestamp": "2025-10-01 04:13:54.960309", "step": 2276, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:54.993177", "step": 2276, "epoch": 3 }, { "type": "loss", "content": 0.00037215909105725586, "timestamp": "2025-10-01 04:13:54.995943", "step": 2277, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:55.027011", "step": 2277, "epoch": 3 }, { "type": "loss", "content": 0.01679880917072296, "timestamp": "2025-10-01 04:13:55.029564", "step": 2278, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:55.060916", "step": 2278, "epoch": 3 }, { "type": "loss", "content": 0.0008941438281908631, "timestamp": "2025-10-01 04:13:55.063248", "step": 2279, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:55.094396", "step": 2279, "epoch": 3 }, { "type": "loss", "content": 0.0006860237335786223, "timestamp": "2025-10-01 04:13:55.118517", "step": 2280, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:13:55.947999", "step": 2280, "epoch": 3 }, { "type": "pplx", "content": 66637274.69816364, "timestamp": "2025-10-01 04:13:55.950876", "step": 2280, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:55.981527", "step": 2280, "epoch": 3 }, { "type": "loss", "content": 0.0006886274204589427, "timestamp": "2025-10-01 04:13:55.984672", "step": 2281, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:56.015998", "step": 2281, "epoch": 3 }, { "type": "loss", "content": 0.004052779637277126, "timestamp": "2025-10-01 04:13:56.018978", "step": 2282, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:56.051788", "step": 2282, "epoch": 3 }, { "type": "loss", "content": 0.00020752070122398436, "timestamp": "2025-10-01 04:13:56.054491", "step": 2283, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:56.088631", "step": 2283, "epoch": 3 }, { "type": "loss", "content": 0.0004296154365874827, "timestamp": "2025-10-01 04:13:56.114021", "step": 2284, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:56.148318", "step": 2284, "epoch": 3 }, { "type": "loss", "content": 0.0006741755059920251, "timestamp": "2025-10-01 04:13:56.151630", "step": 2285, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:56.184792", "step": 2285, "epoch": 3 }, { "type": "loss", "content": 0.0002825469709932804, "timestamp": "2025-10-01 04:13:56.188132", "step": 2286, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:56.221273", "step": 2286, "epoch": 3 }, { "type": "loss", "content": 0.00020196287368889898, "timestamp": "2025-10-01 04:13:56.224908", "step": 2287, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:13:56.258003", "step": 2287, "epoch": 3 }, { "type": "loss", "content": 0.0006172371795400977, "timestamp": "2025-10-01 04:13:56.284258", "step": 2288, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:56.326859", "step": 2288, "epoch": 3 }, { "type": "loss", "content": 0.00020179711282253265, "timestamp": "2025-10-01 04:13:56.330074", "step": 2289, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:56.363187", "step": 2289, "epoch": 3 }, { "type": "loss", "content": 0.0001798906596377492, "timestamp": "2025-10-01 04:13:56.366055", "step": 2290, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:56.398175", "step": 2290, "epoch": 3 }, { "type": "loss", "content": 0.0007496423204429448, "timestamp": "2025-10-01 04:13:56.401033", "step": 2291, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:56.433795", "step": 2291, "epoch": 3 }, { "type": "loss", "content": 0.0031232612673193216, "timestamp": "2025-10-01 04:13:56.458085", "step": 2292, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:56.490985", "step": 2292, "epoch": 3 }, { "type": "loss", "content": 0.00034845713526010513, "timestamp": "2025-10-01 04:13:56.493978", "step": 2293, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:56.527730", "step": 2293, "epoch": 3 }, { "type": "loss", "content": 0.002042332198470831, "timestamp": "2025-10-01 04:13:56.530788", "step": 2294, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:56.564274", "step": 2294, "epoch": 3 }, { "type": "loss", "content": 0.0036737422924488783, "timestamp": "2025-10-01 04:13:56.567665", "step": 2295, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:56.599395", "step": 2295, "epoch": 3 }, { "type": "loss", "content": 0.00028754607774317265, "timestamp": "2025-10-01 04:13:56.623419", "step": 2296, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:56.656536", "step": 2296, "epoch": 3 }, { "type": "loss", "content": 0.0005289117689244449, "timestamp": "2025-10-01 04:13:56.659456", "step": 2297, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:56.691957", "step": 2297, "epoch": 3 }, { "type": "loss", "content": 0.001023622928187251, "timestamp": "2025-10-01 04:13:56.695324", "step": 2298, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:56.728713", "step": 2298, "epoch": 3 }, { "type": "loss", "content": 0.00013159470108803362, "timestamp": "2025-10-01 04:13:56.731160", "step": 2299, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:56.763634", "step": 2299, "epoch": 3 }, { "type": "loss", "content": 0.0003687890130095184, "timestamp": "2025-10-01 04:13:56.788127", "step": 2300, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:56.820633", "step": 2300, "epoch": 3 }, { "type": "loss", "content": 0.00016187668370548636, "timestamp": "2025-10-01 04:13:56.823444", "step": 2301, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:56.857460", "step": 2301, "epoch": 3 }, { "type": "loss", "content": 0.00014664800255559385, "timestamp": "2025-10-01 04:13:56.859740", "step": 2302, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:56.890101", "step": 2302, "epoch": 3 }, { "type": "loss", "content": 0.0200524739921093, "timestamp": "2025-10-01 04:13:56.892366", "step": 2303, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:56.922653", "step": 2303, "epoch": 3 }, { "type": "loss", "content": 0.0009536169818602502, "timestamp": "2025-10-01 04:13:56.946284", "step": 2304, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:56.977135", "step": 2304, "epoch": 3 }, { "type": "loss", "content": 0.00014004905824549496, "timestamp": "2025-10-01 04:13:56.979796", "step": 2305, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:57.011129", "step": 2305, "epoch": 3 }, { "type": "loss", "content": 0.0004129658918827772, "timestamp": "2025-10-01 04:13:57.014211", "step": 2306, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:57.044673", "step": 2306, "epoch": 3 }, { "type": "loss", "content": 0.0029463896062225103, "timestamp": "2025-10-01 04:13:57.047446", "step": 2307, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:57.078767", "step": 2307, "epoch": 3 }, { "type": "loss", "content": 0.0015465685864910483, "timestamp": "2025-10-01 04:13:57.103049", "step": 2308, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:57.134295", "step": 2308, "epoch": 3 }, { "type": "loss", "content": 0.00033053886727429926, "timestamp": "2025-10-01 04:13:57.136656", "step": 2309, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:57.167490", "step": 2309, "epoch": 3 }, { "type": "loss", "content": 0.0004535024636425078, "timestamp": "2025-10-01 04:13:57.170113", "step": 2310, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:57.200399", "step": 2310, "epoch": 3 }, { "type": "loss", "content": 0.00019416247960180044, "timestamp": "2025-10-01 04:13:57.202965", "step": 2311, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:57.233831", "step": 2311, "epoch": 3 }, { "type": "loss", "content": 0.000307450391119346, "timestamp": "2025-10-01 04:13:57.257717", "step": 2312, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:57.289110", "step": 2312, "epoch": 3 }, { "type": "loss", "content": 0.0004815698484890163, "timestamp": "2025-10-01 04:13:57.291523", "step": 2313, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:57.322062", "step": 2313, "epoch": 3 }, { "type": "loss", "content": 9.445106115890667e-05, "timestamp": "2025-10-01 04:13:57.324496", "step": 2314, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:57.355461", "step": 2314, "epoch": 3 }, { "type": "loss", "content": 0.00887442845851183, "timestamp": "2025-10-01 04:13:57.358252", "step": 2315, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:57.389065", "step": 2315, "epoch": 3 }, { "type": "loss", "content": 0.0002211143437307328, "timestamp": "2025-10-01 04:13:57.412957", "step": 2316, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:57.445065", "step": 2316, "epoch": 3 }, { "type": "loss", "content": 0.00011029041343135759, "timestamp": "2025-10-01 04:13:57.447586", "step": 2317, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:57.480235", "step": 2317, "epoch": 3 }, { "type": "loss", "content": 0.00045324538950808346, "timestamp": "2025-10-01 04:13:57.482531", "step": 2318, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:57.513974", "step": 2318, "epoch": 3 }, { "type": "loss", "content": 0.0002062932326225564, "timestamp": "2025-10-01 04:13:57.516597", "step": 2319, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:57.548161", "step": 2319, "epoch": 3 }, { "type": "loss", "content": 0.0019155825721099973, "timestamp": "2025-10-01 04:13:57.572038", "step": 2320, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:57.603052", "step": 2320, "epoch": 3 }, { "type": "loss", "content": 0.0013960811775177717, "timestamp": "2025-10-01 04:13:57.605522", "step": 2321, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:57.635696", "step": 2321, "epoch": 3 }, { "type": "loss", "content": 0.00019416447321418673, "timestamp": "2025-10-01 04:13:57.638213", "step": 2322, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:57.668572", "step": 2322, "epoch": 3 }, { "type": "loss", "content": 0.005333646200597286, "timestamp": "2025-10-01 04:13:57.670929", "step": 2323, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:57.701421", "step": 2323, "epoch": 3 }, { "type": "loss", "content": 0.00022433795675169677, "timestamp": "2025-10-01 04:13:57.725539", "step": 2324, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:57.756019", "step": 2324, "epoch": 3 }, { "type": "loss", "content": 0.0015846246387809515, "timestamp": "2025-10-01 04:13:57.758910", "step": 2325, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:57.790254", "step": 2325, "epoch": 3 }, { "type": "loss", "content": 0.001096657244488597, "timestamp": "2025-10-01 04:13:57.792998", "step": 2326, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:57.824209", "step": 2326, "epoch": 3 }, { "type": "loss", "content": 0.007495674304664135, "timestamp": "2025-10-01 04:13:57.826585", "step": 2327, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:57.857064", "step": 2327, "epoch": 3 }, { "type": "loss", "content": 0.007593679241836071, "timestamp": "2025-10-01 04:13:57.880922", "step": 2328, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:57.911681", "step": 2328, "epoch": 3 }, { "type": "loss", "content": 0.000178185073309578, "timestamp": "2025-10-01 04:13:57.913868", "step": 2329, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:57.943925", "step": 2329, "epoch": 3 }, { "type": "loss", "content": 0.00029983054264448583, "timestamp": "2025-10-01 04:13:57.946258", "step": 2330, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:57.976362", "step": 2330, "epoch": 3 }, { "type": "loss", "content": 0.0005218511214479804, "timestamp": "2025-10-01 04:13:57.979122", "step": 2331, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:58.014030", "step": 2331, "epoch": 3 }, { "type": "loss", "content": 0.0006095552816987038, "timestamp": "2025-10-01 04:13:58.038114", "step": 2332, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:58.069940", "step": 2332, "epoch": 3 }, { "type": "loss", "content": 0.0006886310293339193, "timestamp": "2025-10-01 04:13:58.072385", "step": 2333, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:58.107624", "step": 2333, "epoch": 3 }, { "type": "loss", "content": 0.0004724813625216484, "timestamp": "2025-10-01 04:13:58.110276", "step": 2334, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:58.142440", "step": 2334, "epoch": 3 }, { "type": "loss", "content": 0.0001418648025719449, "timestamp": "2025-10-01 04:13:58.145038", "step": 2335, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:58.176299", "step": 2335, "epoch": 3 }, { "type": "loss", "content": 0.007086599711328745, "timestamp": "2025-10-01 04:13:58.200243", "step": 2336, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:58.231939", "step": 2336, "epoch": 3 }, { "type": "loss", "content": 0.0019127464620396495, "timestamp": "2025-10-01 04:13:58.234561", "step": 2337, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:13:58.987132", "step": 2337, "epoch": 3 }, { "type": "pplx", "content": 69247012.9575338, "timestamp": "2025-10-01 04:13:58.990008", "step": 2337, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:59.019342", "step": 2337, "epoch": 3 }, { "type": "loss", "content": 0.0002083075523842126, "timestamp": "2025-10-01 04:13:59.021841", "step": 2338, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:59.052298", "step": 2338, "epoch": 3 }, { "type": "loss", "content": 8.348600385943428e-05, "timestamp": "2025-10-01 04:13:59.054761", "step": 2339, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:59.085523", "step": 2339, "epoch": 3 }, { "type": "loss", "content": 0.01127583347260952, "timestamp": "2025-10-01 04:13:59.111836", "step": 2340, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:59.142895", "step": 2340, "epoch": 3 }, { "type": "loss", "content": 0.008107397705316544, "timestamp": "2025-10-01 04:13:59.145588", "step": 2341, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:59.176979", "step": 2341, "epoch": 3 }, { "type": "loss", "content": 0.02429473027586937, "timestamp": "2025-10-01 04:13:59.179351", "step": 2342, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:59.211977", "step": 2342, "epoch": 3 }, { "type": "loss", "content": 0.00030902380240149796, "timestamp": "2025-10-01 04:13:59.214422", "step": 2343, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:59.244579", "step": 2343, "epoch": 3 }, { "type": "loss", "content": 0.0004113315953873098, "timestamp": "2025-10-01 04:13:59.268573", "step": 2344, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:59.300573", "step": 2344, "epoch": 3 }, { "type": "loss", "content": 0.046647004783153534, "timestamp": "2025-10-01 04:13:59.303872", "step": 2345, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:59.336321", "step": 2345, "epoch": 3 }, { "type": "loss", "content": 0.0014374825404956937, "timestamp": "2025-10-01 04:13:59.339699", "step": 2346, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:59.371559", "step": 2346, "epoch": 3 }, { "type": "loss", "content": 6.242156814550981e-05, "timestamp": "2025-10-01 04:13:59.375361", "step": 2347, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:59.406163", "step": 2347, "epoch": 3 }, { "type": "loss", "content": 0.05559505149722099, "timestamp": "2025-10-01 04:13:59.431596", "step": 2348, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:59.463205", "step": 2348, "epoch": 3 }, { "type": "loss", "content": 0.00017471036699134856, "timestamp": "2025-10-01 04:13:59.466400", "step": 2349, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:59.500536", "step": 2349, "epoch": 3 }, { "type": "loss", "content": 0.06607859581708908, "timestamp": "2025-10-01 04:13:59.503331", "step": 2350, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:13:59.534609", "step": 2350, "epoch": 3 }, { "type": "loss", "content": 0.0028158756904304028, "timestamp": "2025-10-01 04:13:59.537181", "step": 2351, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:59.568813", "step": 2351, "epoch": 3 }, { "type": "loss", "content": 0.0007161787361837924, "timestamp": "2025-10-01 04:13:59.592904", "step": 2352, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:59.625572", "step": 2352, "epoch": 3 }, { "type": "loss", "content": 6.582152127521113e-05, "timestamp": "2025-10-01 04:13:59.627797", "step": 2353, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:59.660698", "step": 2353, "epoch": 3 }, { "type": "loss", "content": 0.016403676941990852, "timestamp": "2025-10-01 04:13:59.663445", "step": 2354, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:59.694490", "step": 2354, "epoch": 3 }, { "type": "loss", "content": 0.00014711846597492695, "timestamp": "2025-10-01 04:13:59.697214", "step": 2355, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:59.727858", "step": 2355, "epoch": 3 }, { "type": "loss", "content": 8.415959746344015e-05, "timestamp": "2025-10-01 04:13:59.752319", "step": 2356, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:59.786727", "step": 2356, "epoch": 3 }, { "type": "loss", "content": 0.00010823925549630076, "timestamp": "2025-10-01 04:13:59.789310", "step": 2357, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:59.824218", "step": 2357, "epoch": 3 }, { "type": "loss", "content": 8.591300866100937e-05, "timestamp": "2025-10-01 04:13:59.828789", "step": 2358, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:59.861217", "step": 2358, "epoch": 3 }, { "type": "loss", "content": 0.00018384023860562593, "timestamp": "2025-10-01 04:13:59.863539", "step": 2359, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:59.895521", "step": 2359, "epoch": 3 }, { "type": "loss", "content": 0.024127814918756485, "timestamp": "2025-10-01 04:13:59.919735", "step": 2360, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:13:59.955433", "step": 2360, "epoch": 3 }, { "type": "loss", "content": 8.21871726657264e-05, "timestamp": "2025-10-01 04:13:59.958042", "step": 2361, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:13:59.989529", "step": 2361, "epoch": 3 }, { "type": "loss", "content": 0.0018586823716759682, "timestamp": "2025-10-01 04:13:59.993816", "step": 2362, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:00.024887", "step": 2362, "epoch": 3 }, { "type": "loss", "content": 0.00025929472758434713, "timestamp": "2025-10-01 04:14:00.027665", "step": 2363, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:00.058808", "step": 2363, "epoch": 3 }, { "type": "loss", "content": 0.002079632831737399, "timestamp": "2025-10-01 04:14:00.083423", "step": 2364, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:00.115034", "step": 2364, "epoch": 3 }, { "type": "loss", "content": 0.02933317981660366, "timestamp": "2025-10-01 04:14:00.117653", "step": 2365, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:00.148211", "step": 2365, "epoch": 3 }, { "type": "loss", "content": 0.013687198050320148, "timestamp": "2025-10-01 04:14:00.150738", "step": 2366, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:00.182206", "step": 2366, "epoch": 3 }, { "type": "loss", "content": 0.0029233102686703205, "timestamp": "2025-10-01 04:14:00.185068", "step": 2367, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:00.215640", "step": 2367, "epoch": 3 }, { "type": "loss", "content": 0.010431193746626377, "timestamp": "2025-10-01 04:14:00.239646", "step": 2368, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:00.270415", "step": 2368, "epoch": 3 }, { "type": "loss", "content": 0.00024859068798832595, "timestamp": "2025-10-01 04:14:00.273133", "step": 2369, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:00.304287", "step": 2369, "epoch": 3 }, { "type": "loss", "content": 0.0006030590157024562, "timestamp": "2025-10-01 04:14:00.306784", "step": 2370, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:00.337195", "step": 2370, "epoch": 3 }, { "type": "loss", "content": 0.043975941836833954, "timestamp": "2025-10-01 04:14:00.340000", "step": 2371, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:00.370964", "step": 2371, "epoch": 3 }, { "type": "loss", "content": 0.0028401927556842566, "timestamp": "2025-10-01 04:14:00.394905", "step": 2372, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:00.426091", "step": 2372, "epoch": 3 }, { "type": "loss", "content": 0.07099797576665878, "timestamp": "2025-10-01 04:14:00.428473", "step": 2373, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:00.458817", "step": 2373, "epoch": 3 }, { "type": "loss", "content": 0.0027063204906880856, "timestamp": "2025-10-01 04:14:00.462017", "step": 2374, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:00.492093", "step": 2374, "epoch": 3 }, { "type": "loss", "content": 0.09009862691164017, "timestamp": "2025-10-01 04:14:00.495027", "step": 2375, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:00.525451", "step": 2375, "epoch": 3 }, { "type": "loss", "content": 0.018117502331733704, "timestamp": "2025-10-01 04:14:00.549383", "step": 2376, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:00.580272", "step": 2376, "epoch": 3 }, { "type": "loss", "content": 0.0008630921947769821, "timestamp": "2025-10-01 04:14:00.582748", "step": 2377, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:00.613816", "step": 2377, "epoch": 3 }, { "type": "loss", "content": 0.04184184595942497, "timestamp": "2025-10-01 04:14:00.616387", "step": 2378, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:00.647532", "step": 2378, "epoch": 3 }, { "type": "loss", "content": 0.0033638239838182926, "timestamp": "2025-10-01 04:14:00.651025", "step": 2379, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:00.682110", "step": 2379, "epoch": 3 }, { "type": "loss", "content": 0.003705172333866358, "timestamp": "2025-10-01 04:14:00.706053", "step": 2380, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:00.737177", "step": 2380, "epoch": 3 }, { "type": "loss", "content": 0.013101239688694477, "timestamp": "2025-10-01 04:14:00.739782", "step": 2381, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:00.771275", "step": 2381, "epoch": 3 }, { "type": "loss", "content": 0.003192309755831957, "timestamp": "2025-10-01 04:14:00.773775", "step": 2382, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:00.805257", "step": 2382, "epoch": 3 }, { "type": "loss", "content": 0.024804024025797844, "timestamp": "2025-10-01 04:14:00.807975", "step": 2383, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:00.839636", "step": 2383, "epoch": 3 }, { "type": "loss", "content": 0.016119828447699547, "timestamp": "2025-10-01 04:14:00.863745", "step": 2384, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:00.895939", "step": 2384, "epoch": 3 }, { "type": "loss", "content": 0.005999320652335882, "timestamp": "2025-10-01 04:14:00.898325", "step": 2385, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:00.929460", "step": 2385, "epoch": 3 }, { "type": "loss", "content": 0.011366310529410839, "timestamp": "2025-10-01 04:14:00.931995", "step": 2386, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:00.964068", "step": 2386, "epoch": 3 }, { "type": "loss", "content": 0.021910708397626877, "timestamp": "2025-10-01 04:14:00.966187", "step": 2387, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:00.997241", "step": 2387, "epoch": 3 }, { "type": "loss", "content": 0.01992500014603138, "timestamp": "2025-10-01 04:14:01.021334", "step": 2388, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:01.053577", "step": 2388, "epoch": 3 }, { "type": "loss", "content": 0.009192356839776039, "timestamp": "2025-10-01 04:14:01.056098", "step": 2389, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:01.087673", "step": 2389, "epoch": 3 }, { "type": "loss", "content": 0.0116264121606946, "timestamp": "2025-10-01 04:14:01.090460", "step": 2390, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:01.122261", "step": 2390, "epoch": 3 }, { "type": "loss", "content": 0.011775086633861065, "timestamp": "2025-10-01 04:14:01.124701", "step": 2391, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:01.156100", "step": 2391, "epoch": 3 }, { "type": "loss", "content": 0.015654785558581352, "timestamp": "2025-10-01 04:14:01.180111", "step": 2392, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:01.211124", "step": 2392, "epoch": 3 }, { "type": "loss", "content": 0.014394355937838554, "timestamp": "2025-10-01 04:14:01.215194", "step": 2393, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:01.245990", "step": 2393, "epoch": 3 }, { "type": "loss", "content": 0.007884612306952477, "timestamp": "2025-10-01 04:14:01.248763", "step": 2394, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:14:02.017783", "step": 2394, "epoch": 3 }, { "type": "pplx", "content": 52674423.42730978, "timestamp": "2025-10-01 04:14:02.019827", "step": 2394, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:02.050004", "step": 2394, "epoch": 3 }, { "type": "loss", "content": 0.015185212716460228, "timestamp": "2025-10-01 04:14:02.052314", "step": 2395, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:02.085021", "step": 2395, "epoch": 3 }, { "type": "loss", "content": 0.008075162768363953, "timestamp": "2025-10-01 04:14:02.109345", "step": 2396, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:02.143533", "step": 2396, "epoch": 3 }, { "type": "loss", "content": 0.012511581182479858, "timestamp": "2025-10-01 04:14:02.146093", "step": 2397, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:02.179979", "step": 2397, "epoch": 3 }, { "type": "loss", "content": 0.011734097264707088, "timestamp": "2025-10-01 04:14:02.182677", "step": 2398, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:02.216565", "step": 2398, "epoch": 3 }, { "type": "loss", "content": 0.01114694681018591, "timestamp": "2025-10-01 04:14:02.219096", "step": 2399, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:02.251696", "step": 2399, "epoch": 3 }, { "type": "loss", "content": 0.009856194257736206, "timestamp": "2025-10-01 04:14:02.276049", "step": 2400, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:02.311475", "step": 2400, "epoch": 3 }, { "type": "loss", "content": 0.007763002533465624, "timestamp": "2025-10-01 04:14:02.314508", "step": 2401, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:14:02.347485", "step": 2401, "epoch": 3 }, { "type": "loss", "content": 0.0154880927875638, "timestamp": "2025-10-01 04:14:02.350438", "step": 2402, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:02.384076", "step": 2402, "epoch": 3 }, { "type": "loss", "content": 0.014285017736256123, "timestamp": "2025-10-01 04:14:02.386525", "step": 2403, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:02.419442", "step": 2403, "epoch": 3 }, { "type": "loss", "content": 0.008886818774044514, "timestamp": "2025-10-01 04:14:02.443889", "step": 2404, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:02.477835", "step": 2404, "epoch": 3 }, { "type": "loss", "content": 0.00973739568144083, "timestamp": "2025-10-01 04:14:02.480309", "step": 2405, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:02.527693", "step": 2405, "epoch": 3 }, { "type": "loss", "content": 0.01172753144055605, "timestamp": "2025-10-01 04:14:02.534089", "step": 2406, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:02.579922", "step": 2406, "epoch": 3 }, { "type": "loss", "content": 0.009691670536994934, "timestamp": "2025-10-01 04:14:02.583021", "step": 2407, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:02.615011", "step": 2407, "epoch": 3 }, { "type": "loss", "content": 0.008152500726282597, "timestamp": "2025-10-01 04:14:02.640631", "step": 2408, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:02.679784", "step": 2408, "epoch": 3 }, { "type": "loss", "content": 0.011871717870235443, "timestamp": "2025-10-01 04:14:02.681970", "step": 2409, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:02.715916", "step": 2409, "epoch": 3 }, { "type": "loss", "content": 0.018376769497990608, "timestamp": "2025-10-01 04:14:02.718064", "step": 2410, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:02.750256", "step": 2410, "epoch": 3 }, { "type": "loss", "content": 0.010106426663696766, "timestamp": "2025-10-01 04:14:02.754437", "step": 2411, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:14:02.785988", "step": 2411, "epoch": 3 }, { "type": "loss", "content": 0.01129311416298151, "timestamp": "2025-10-01 04:14:02.810005", "step": 2412, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:02.850051", "step": 2412, "epoch": 3 }, { "type": "loss", "content": 0.008512974716722965, "timestamp": "2025-10-01 04:14:02.854452", "step": 2413, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:02.886225", "step": 2413, "epoch": 3 }, { "type": "loss", "content": 0.006000932771712542, "timestamp": "2025-10-01 04:14:02.889066", "step": 2414, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:02.920294", "step": 2414, "epoch": 3 }, { "type": "loss", "content": 0.007568769156932831, "timestamp": "2025-10-01 04:14:02.923494", "step": 2415, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:02.954710", "step": 2415, "epoch": 3 }, { "type": "loss", "content": 0.004484867211431265, "timestamp": "2025-10-01 04:14:02.978807", "step": 2416, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:03.011179", "step": 2416, "epoch": 3 }, { "type": "loss", "content": 0.006643087603151798, "timestamp": "2025-10-01 04:14:03.013504", "step": 2417, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:03.047362", "step": 2417, "epoch": 3 }, { "type": "loss", "content": 0.007000813726335764, "timestamp": "2025-10-01 04:14:03.049777", "step": 2418, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:03.081423", "step": 2418, "epoch": 3 }, { "type": "loss", "content": 0.012717373669147491, "timestamp": "2025-10-01 04:14:03.084148", "step": 2419, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:03.115610", "step": 2419, "epoch": 3 }, { "type": "loss", "content": 0.005935595370829105, "timestamp": "2025-10-01 04:14:03.142767", "step": 2420, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:03.176168", "step": 2420, "epoch": 3 }, { "type": "loss", "content": 0.004426135681569576, "timestamp": "2025-10-01 04:14:03.178773", "step": 2421, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:03.210338", "step": 2421, "epoch": 3 }, { "type": "loss", "content": 0.024797197431325912, "timestamp": "2025-10-01 04:14:03.212759", "step": 2422, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:03.244167", "step": 2422, "epoch": 3 }, { "type": "loss", "content": 0.005650327540934086, "timestamp": "2025-10-01 04:14:03.246269", "step": 2423, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:03.278142", "step": 2423, "epoch": 3 }, { "type": "loss", "content": 0.006938618142157793, "timestamp": "2025-10-01 04:14:03.302270", "step": 2424, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 176 ], "flops": 5220903722048 }, "timestamp": "2025-10-01 04:14:03.333865", "step": 2424, "epoch": 3 }, { "type": "loss", "content": 0.0073814005590975285, "timestamp": "2025-10-01 04:14:03.341872", "step": 2425, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:03.374439", "step": 2425, "epoch": 3 }, { "type": "loss", "content": 0.004846468102186918, "timestamp": "2025-10-01 04:14:03.379119", "step": 2426, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:03.410258", "step": 2426, "epoch": 3 }, { "type": "loss", "content": 0.018962262198328972, "timestamp": "2025-10-01 04:14:03.412977", "step": 2427, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:03.457844", "step": 2427, "epoch": 3 }, { "type": "loss", "content": 0.0020291144028306007, "timestamp": "2025-10-01 04:14:03.481960", "step": 2428, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:03.514020", "step": 2428, "epoch": 3 }, { "type": "loss", "content": 0.011502564884722233, "timestamp": "2025-10-01 04:14:03.516281", "step": 2429, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:03.547165", "step": 2429, "epoch": 3 }, { "type": "loss", "content": 0.015590599738061428, "timestamp": "2025-10-01 04:14:03.550344", "step": 2430, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:03.583905", "step": 2430, "epoch": 3 }, { "type": "loss", "content": 0.004602809436619282, "timestamp": "2025-10-01 04:14:03.586653", "step": 2431, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:03.620359", "step": 2431, "epoch": 3 }, { "type": "loss", "content": 0.0069076851941645145, "timestamp": "2025-10-01 04:14:03.644622", "step": 2432, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:03.679199", "step": 2432, "epoch": 3 }, { "type": "loss", "content": 0.014969917014241219, "timestamp": "2025-10-01 04:14:03.681570", "step": 2433, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:03.714456", "step": 2433, "epoch": 3 }, { "type": "loss", "content": 0.003453887300565839, "timestamp": "2025-10-01 04:14:03.717138", "step": 2434, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:03.748953", "step": 2434, "epoch": 3 }, { "type": "loss", "content": 0.00447799963876605, "timestamp": "2025-10-01 04:14:03.751378", "step": 2435, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:03.781410", "step": 2435, "epoch": 3 }, { "type": "loss", "content": 0.008956313133239746, "timestamp": "2025-10-01 04:14:03.805616", "step": 2436, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:03.842161", "step": 2436, "epoch": 3 }, { "type": "loss", "content": 0.001485818182118237, "timestamp": "2025-10-01 04:14:03.844463", "step": 2437, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:03.875202", "step": 2437, "epoch": 3 }, { "type": "loss", "content": 0.009877298958599567, "timestamp": "2025-10-01 04:14:03.877840", "step": 2438, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:03.907977", "step": 2438, "epoch": 3 }, { "type": "loss", "content": 0.023570170626044273, "timestamp": "2025-10-01 04:14:03.910359", "step": 2439, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:03.941080", "step": 2439, "epoch": 3 }, { "type": "loss", "content": 0.003152408404275775, "timestamp": "2025-10-01 04:14:03.965628", "step": 2440, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:03.998071", "step": 2440, "epoch": 3 }, { "type": "loss", "content": 0.006031573750078678, "timestamp": "2025-10-01 04:14:04.000541", "step": 2441, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:04.031851", "step": 2441, "epoch": 3 }, { "type": "loss", "content": 0.005832657217979431, "timestamp": "2025-10-01 04:14:04.034058", "step": 2442, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:04.065568", "step": 2442, "epoch": 3 }, { "type": "loss", "content": 0.005281724501401186, "timestamp": "2025-10-01 04:14:04.067949", "step": 2443, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:04.097874", "step": 2443, "epoch": 3 }, { "type": "loss", "content": 0.005685365758836269, "timestamp": "2025-10-01 04:14:04.121907", "step": 2444, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:04.152927", "step": 2444, "epoch": 3 }, { "type": "loss", "content": 0.020365357398986816, "timestamp": "2025-10-01 04:14:04.155486", "step": 2445, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:04.185968", "step": 2445, "epoch": 3 }, { "type": "loss", "content": 0.0016104949172586203, "timestamp": "2025-10-01 04:14:04.188364", "step": 2446, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:04.219932", "step": 2446, "epoch": 3 }, { "type": "loss", "content": 0.004023754503577948, "timestamp": "2025-10-01 04:14:04.223340", "step": 2447, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:04.256884", "step": 2447, "epoch": 3 }, { "type": "loss", "content": 0.0035851493012160063, "timestamp": "2025-10-01 04:14:04.282290", "step": 2448, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:04.315053", "step": 2448, "epoch": 3 }, { "type": "loss", "content": 0.03891408443450928, "timestamp": "2025-10-01 04:14:04.317928", "step": 2449, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:04.351340", "step": 2449, "epoch": 3 }, { "type": "loss", "content": 0.008011159487068653, "timestamp": "2025-10-01 04:14:04.354461", "step": 2450, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:04.388876", "step": 2450, "epoch": 3 }, { "type": "loss", "content": 0.016046635806560516, "timestamp": "2025-10-01 04:14:04.391874", "step": 2451, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:14:05.221196", "step": 2451, "epoch": 3 }, { "type": "pplx", "content": 58824101.59490909, "timestamp": "2025-10-01 04:14:05.225001", "step": 2451, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:05.256821", "step": 2451, "epoch": 3 }, { "type": "loss", "content": 0.019496530294418335, "timestamp": "2025-10-01 04:14:05.288776", "step": 2452, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:05.347822", "step": 2452, "epoch": 3 }, { "type": "loss", "content": 0.004884731490164995, "timestamp": "2025-10-01 04:14:05.350206", "step": 2453, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:05.383626", "step": 2453, "epoch": 3 }, { "type": "loss", "content": 0.0024897728580981493, "timestamp": "2025-10-01 04:14:05.386011", "step": 2454, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:05.417936", "step": 2454, "epoch": 3 }, { "type": "loss", "content": 0.0227744672447443, "timestamp": "2025-10-01 04:14:05.421524", "step": 2455, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:05.457406", "step": 2455, "epoch": 3 }, { "type": "loss", "content": 0.001688766060397029, "timestamp": "2025-10-01 04:14:05.482534", "step": 2456, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:05.514338", "step": 2456, "epoch": 3 }, { "type": "loss", "content": 0.001542889280244708, "timestamp": "2025-10-01 04:14:05.517040", "step": 2457, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:05.551005", "step": 2457, "epoch": 3 }, { "type": "loss", "content": 0.009548446163535118, "timestamp": "2025-10-01 04:14:05.553160", "step": 2458, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:05.586414", "step": 2458, "epoch": 3 }, { "type": "loss", "content": 0.004925273358821869, "timestamp": "2025-10-01 04:14:05.588532", "step": 2459, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:05.619974", "step": 2459, "epoch": 3 }, { "type": "loss", "content": 0.0032642006408423185, "timestamp": "2025-10-01 04:14:05.646415", "step": 2460, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:05.678964", "step": 2460, "epoch": 3 }, { "type": "loss", "content": 0.0020129564218223095, "timestamp": "2025-10-01 04:14:05.684312", "step": 2461, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:05.718026", "step": 2461, "epoch": 3 }, { "type": "loss", "content": 0.018269294872879982, "timestamp": "2025-10-01 04:14:05.720987", "step": 2462, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:05.762885", "step": 2462, "epoch": 3 }, { "type": "loss", "content": 0.02364032343029976, "timestamp": "2025-10-01 04:14:05.766163", "step": 2463, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:05.807096", "step": 2463, "epoch": 3 }, { "type": "loss", "content": 0.015023179352283478, "timestamp": "2025-10-01 04:14:05.831458", "step": 2464, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:05.863001", "step": 2464, "epoch": 3 }, { "type": "loss", "content": 0.0021603351924568415, "timestamp": "2025-10-01 04:14:05.865465", "step": 2465, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:05.897938", "step": 2465, "epoch": 3 }, { "type": "loss", "content": 0.0008657817961648107, "timestamp": "2025-10-01 04:14:05.901751", "step": 2466, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:05.938873", "step": 2466, "epoch": 3 }, { "type": "loss", "content": 0.005796634126454592, "timestamp": "2025-10-01 04:14:05.941195", "step": 2467, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:05.973410", "step": 2467, "epoch": 3 }, { "type": "loss", "content": 0.05339441075921059, "timestamp": "2025-10-01 04:14:05.998674", "step": 2468, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:06.037270", "step": 2468, "epoch": 3 }, { "type": "loss", "content": 0.003976800944656134, "timestamp": "2025-10-01 04:14:06.040353", "step": 2469, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:06.074359", "step": 2469, "epoch": 3 }, { "type": "loss", "content": 0.03016142174601555, "timestamp": "2025-10-01 04:14:06.077588", "step": 2470, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:06.118938", "step": 2470, "epoch": 3 }, { "type": "loss", "content": 0.0010798171861097217, "timestamp": "2025-10-01 04:14:06.121907", "step": 2471, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:06.154090", "step": 2471, "epoch": 3 }, { "type": "loss", "content": 0.0030930840875953436, "timestamp": "2025-10-01 04:14:06.180904", "step": 2472, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:06.222815", "step": 2472, "epoch": 3 }, { "type": "loss", "content": 0.026820823550224304, "timestamp": "2025-10-01 04:14:06.226132", "step": 2473, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:06.261007", "step": 2473, "epoch": 3 }, { "type": "loss", "content": 0.009904695674777031, "timestamp": "2025-10-01 04:14:06.267564", "step": 2474, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:06.303927", "step": 2474, "epoch": 3 }, { "type": "loss", "content": 0.0008741992642171681, "timestamp": "2025-10-01 04:14:06.307253", "step": 2475, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:06.343799", "step": 2475, "epoch": 3 }, { "type": "loss", "content": 0.0030690168496221304, "timestamp": "2025-10-01 04:14:06.368367", "step": 2476, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:06.404554", "step": 2476, "epoch": 3 }, { "type": "loss", "content": 0.0016391616081818938, "timestamp": "2025-10-01 04:14:06.413010", "step": 2477, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:06.451251", "step": 2477, "epoch": 3 }, { "type": "loss", "content": 0.012302136048674583, "timestamp": "2025-10-01 04:14:06.454209", "step": 2478, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:06.484875", "step": 2478, "epoch": 3 }, { "type": "loss", "content": 0.0012789281317964196, "timestamp": "2025-10-01 04:14:06.487546", "step": 2479, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:06.519730", "step": 2479, "epoch": 3 }, { "type": "loss", "content": 0.012554749846458435, "timestamp": "2025-10-01 04:14:06.544387", "step": 2480, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:06.578237", "step": 2480, "epoch": 3 }, { "type": "loss", "content": 0.01758808270096779, "timestamp": "2025-10-01 04:14:06.580723", "step": 2481, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:06.612731", "step": 2481, "epoch": 3 }, { "type": "loss", "content": 0.0004258893895894289, "timestamp": "2025-10-01 04:14:06.615496", "step": 2482, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:06.649647", "step": 2482, "epoch": 3 }, { "type": "loss", "content": 0.0005650474922731519, "timestamp": "2025-10-01 04:14:06.652582", "step": 2483, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:06.685697", "step": 2483, "epoch": 3 }, { "type": "loss", "content": 0.00029421233921311796, "timestamp": "2025-10-01 04:14:06.709290", "step": 2484, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:06.739410", "step": 2484, "epoch": 3 }, { "type": "loss", "content": 0.02421017363667488, "timestamp": "2025-10-01 04:14:06.742135", "step": 2485, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:06.772500", "step": 2485, "epoch": 3 }, { "type": "loss", "content": 0.013585188426077366, "timestamp": "2025-10-01 04:14:06.774728", "step": 2486, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:06.804932", "step": 2486, "epoch": 3 }, { "type": "loss", "content": 0.02027386613190174, "timestamp": "2025-10-01 04:14:06.807363", "step": 2487, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:06.837182", "step": 2487, "epoch": 3 }, { "type": "loss", "content": 0.03536064550280571, "timestamp": "2025-10-01 04:14:06.861548", "step": 2488, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:06.892325", "step": 2488, "epoch": 3 }, { "type": "loss", "content": 0.027367528527975082, "timestamp": "2025-10-01 04:14:06.894669", "step": 2489, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:06.924585", "step": 2489, "epoch": 3 }, { "type": "loss", "content": 0.0001808692904887721, "timestamp": "2025-10-01 04:14:06.926917", "step": 2490, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:06.957215", "step": 2490, "epoch": 3 }, { "type": "loss", "content": 9.770004544407129e-05, "timestamp": "2025-10-01 04:14:06.960113", "step": 2491, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:06.991117", "step": 2491, "epoch": 3 }, { "type": "loss", "content": 0.047163479030132294, "timestamp": "2025-10-01 04:14:07.014873", "step": 2492, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:07.045727", "step": 2492, "epoch": 3 }, { "type": "loss", "content": 0.0003407765761949122, "timestamp": "2025-10-01 04:14:07.047852", "step": 2493, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:07.078555", "step": 2493, "epoch": 3 }, { "type": "loss", "content": 0.0008606308838352561, "timestamp": "2025-10-01 04:14:07.081022", "step": 2494, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:07.120330", "step": 2494, "epoch": 3 }, { "type": "loss", "content": 0.02153780125081539, "timestamp": "2025-10-01 04:14:07.122917", "step": 2495, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:07.152750", "step": 2495, "epoch": 3 }, { "type": "loss", "content": 0.0014575996901839972, "timestamp": "2025-10-01 04:14:07.176603", "step": 2496, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:07.206559", "step": 2496, "epoch": 3 }, { "type": "loss", "content": 0.0022539051715284586, "timestamp": "2025-10-01 04:14:07.209256", "step": 2497, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:07.239927", "step": 2497, "epoch": 3 }, { "type": "loss", "content": 0.0031828132923692465, "timestamp": "2025-10-01 04:14:07.242121", "step": 2498, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:07.273082", "step": 2498, "epoch": 3 }, { "type": "loss", "content": 0.0009762569679878652, "timestamp": "2025-10-01 04:14:07.275896", "step": 2499, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:07.308529", "step": 2499, "epoch": 3 }, { "type": "loss", "content": 0.001326889730989933, "timestamp": "2025-10-01 04:14:07.332693", "step": 2500, "epoch": 3 }, { "type": "info", "content": "Checkpoint saved at step 2500", "timestamp": "2025-10-01 04:14:12.148171", "step": 2500, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:12.199408", "step": 2500, "epoch": 3 }, { "type": "loss", "content": 0.002399686025455594, "timestamp": "2025-10-01 04:14:12.202025", "step": 2501, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:12.234214", "step": 2501, "epoch": 3 }, { "type": "loss", "content": 0.009733153507113457, "timestamp": "2025-10-01 04:14:12.236908", "step": 2502, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:12.268922", "step": 2502, "epoch": 3 }, { "type": "loss", "content": 0.031686168164014816, "timestamp": "2025-10-01 04:14:12.271423", "step": 2503, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:12.302847", "step": 2503, "epoch": 3 }, { "type": "loss", "content": 0.049481626600027084, "timestamp": "2025-10-01 04:14:12.327083", "step": 2504, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:12.359654", "step": 2504, "epoch": 3 }, { "type": "loss", "content": 0.0006520474562421441, "timestamp": "2025-10-01 04:14:12.362180", "step": 2505, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:12.393758", "step": 2505, "epoch": 3 }, { "type": "loss", "content": 0.015490526333451271, "timestamp": "2025-10-01 04:14:12.397216", "step": 2506, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:12.429563", "step": 2506, "epoch": 3 }, { "type": "loss", "content": 0.003928300458937883, "timestamp": "2025-10-01 04:14:12.431997", "step": 2507, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:12.462731", "step": 2507, "epoch": 3 }, { "type": "loss", "content": 0.031879812479019165, "timestamp": "2025-10-01 04:14:12.488256", "step": 2508, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:14:13.246351", "step": 2508, "epoch": 3 }, { "type": "pplx", "content": 60019287.10333549, "timestamp": "2025-10-01 04:14:13.248288", "step": 2508, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:13.276815", "step": 2508, "epoch": 3 }, { "type": "loss", "content": 0.05604308471083641, "timestamp": "2025-10-01 04:14:13.278948", "step": 2509, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:13.309323", "step": 2509, "epoch": 3 }, { "type": "loss", "content": 0.007913426496088505, "timestamp": "2025-10-01 04:14:13.312382", "step": 2510, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:13.346529", "step": 2510, "epoch": 3 }, { "type": "loss", "content": 0.00519518880173564, "timestamp": "2025-10-01 04:14:13.349919", "step": 2511, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:13.385194", "step": 2511, "epoch": 3 }, { "type": "loss", "content": 0.0252819936722517, "timestamp": "2025-10-01 04:14:13.410825", "step": 2512, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:13.444937", "step": 2512, "epoch": 3 }, { "type": "loss", "content": 0.029163526371121407, "timestamp": "2025-10-01 04:14:13.448372", "step": 2513, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:13.481925", "step": 2513, "epoch": 3 }, { "type": "loss", "content": 0.003907273523509502, "timestamp": "2025-10-01 04:14:13.485025", "step": 2514, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:13.518343", "step": 2514, "epoch": 3 }, { "type": "loss", "content": 0.012420463375747204, "timestamp": "2025-10-01 04:14:13.521087", "step": 2515, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:13.556710", "step": 2515, "epoch": 3 }, { "type": "loss", "content": 0.01203235238790512, "timestamp": "2025-10-01 04:14:13.581378", "step": 2516, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:13.613494", "step": 2516, "epoch": 3 }, { "type": "loss", "content": 0.005583175923675299, "timestamp": "2025-10-01 04:14:13.616904", "step": 2517, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:13.651226", "step": 2517, "epoch": 3 }, { "type": "loss", "content": 0.0028394819237291813, "timestamp": "2025-10-01 04:14:13.654329", "step": 2518, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:13.690369", "step": 2518, "epoch": 3 }, { "type": "loss", "content": 0.024792209267616272, "timestamp": "2025-10-01 04:14:13.693631", "step": 2519, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:13.726359", "step": 2519, "epoch": 3 }, { "type": "loss", "content": 0.0031874720007181168, "timestamp": "2025-10-01 04:14:13.751824", "step": 2520, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:13.784826", "step": 2520, "epoch": 3 }, { "type": "loss", "content": 0.0023829268757253885, "timestamp": "2025-10-01 04:14:13.788388", "step": 2521, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:13.822803", "step": 2521, "epoch": 3 }, { "type": "loss", "content": 0.0028786410111933947, "timestamp": "2025-10-01 04:14:13.825762", "step": 2522, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:13.858045", "step": 2522, "epoch": 3 }, { "type": "loss", "content": 0.0209118090569973, "timestamp": "2025-10-01 04:14:13.861200", "step": 2523, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:13.894897", "step": 2523, "epoch": 3 }, { "type": "loss", "content": 0.005318788345903158, "timestamp": "2025-10-01 04:14:13.918761", "step": 2524, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:13.951190", "step": 2524, "epoch": 3 }, { "type": "loss", "content": 0.007208620198071003, "timestamp": "2025-10-01 04:14:13.954917", "step": 2525, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:13.992272", "step": 2525, "epoch": 3 }, { "type": "loss", "content": 0.004015002399682999, "timestamp": "2025-10-01 04:14:13.996192", "step": 2526, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:14.030480", "step": 2526, "epoch": 3 }, { "type": "loss", "content": 0.009447134099900723, "timestamp": "2025-10-01 04:14:14.033944", "step": 2527, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:14.067732", "step": 2527, "epoch": 3 }, { "type": "loss", "content": 0.03116469271481037, "timestamp": "2025-10-01 04:14:14.092425", "step": 2528, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:14.127600", "step": 2528, "epoch": 3 }, { "type": "loss", "content": 0.0030621045734733343, "timestamp": "2025-10-01 04:14:14.130340", "step": 2529, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:14.163733", "step": 2529, "epoch": 3 }, { "type": "loss", "content": 0.00671646511182189, "timestamp": "2025-10-01 04:14:14.167129", "step": 2530, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:14.200423", "step": 2530, "epoch": 3 }, { "type": "loss", "content": 0.005486390553414822, "timestamp": "2025-10-01 04:14:14.203687", "step": 2531, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:14.237212", "step": 2531, "epoch": 3 }, { "type": "loss", "content": 0.0026912540197372437, "timestamp": "2025-10-01 04:14:14.261531", "step": 2532, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:14.294926", "step": 2532, "epoch": 3 }, { "type": "loss", "content": 0.030198076739907265, "timestamp": "2025-10-01 04:14:14.298234", "step": 2533, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:14.331095", "step": 2533, "epoch": 3 }, { "type": "loss", "content": 0.02517978474497795, "timestamp": "2025-10-01 04:14:14.334202", "step": 2534, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:14.368731", "step": 2534, "epoch": 3 }, { "type": "loss", "content": 0.00408556591719389, "timestamp": "2025-10-01 04:14:14.371962", "step": 2535, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:14.404303", "step": 2535, "epoch": 3 }, { "type": "loss", "content": 0.00379009242169559, "timestamp": "2025-10-01 04:14:14.428750", "step": 2536, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:14.463135", "step": 2536, "epoch": 3 }, { "type": "loss", "content": 0.0170338936150074, "timestamp": "2025-10-01 04:14:14.466208", "step": 2537, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:14.499354", "step": 2537, "epoch": 3 }, { "type": "loss", "content": 0.01684098318219185, "timestamp": "2025-10-01 04:14:14.502471", "step": 2538, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:14.536295", "step": 2538, "epoch": 3 }, { "type": "loss", "content": 0.014581390656530857, "timestamp": "2025-10-01 04:14:14.540270", "step": 2539, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:14.574547", "step": 2539, "epoch": 3 }, { "type": "loss", "content": 0.0013121498050168157, "timestamp": "2025-10-01 04:14:14.599204", "step": 2540, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:14.632799", "step": 2540, "epoch": 3 }, { "type": "loss", "content": 0.0016053777653723955, "timestamp": "2025-10-01 04:14:14.636173", "step": 2541, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:14.671324", "step": 2541, "epoch": 3 }, { "type": "loss", "content": 0.003269800217822194, "timestamp": "2025-10-01 04:14:14.674486", "step": 2542, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:14.707494", "step": 2542, "epoch": 3 }, { "type": "loss", "content": 0.003128649899736047, "timestamp": "2025-10-01 04:14:14.710926", "step": 2543, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:14.743966", "step": 2543, "epoch": 3 }, { "type": "loss", "content": 0.03637135028839111, "timestamp": "2025-10-01 04:14:14.768427", "step": 2544, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:14:14.800812", "step": 2544, "epoch": 3 }, { "type": "loss", "content": 0.005533142946660519, "timestamp": "2025-10-01 04:14:14.803153", "step": 2545, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:14.836575", "step": 2545, "epoch": 3 }, { "type": "loss", "content": 0.0069894432090222836, "timestamp": "2025-10-01 04:14:14.839261", "step": 2546, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:14.872293", "step": 2546, "epoch": 3 }, { "type": "loss", "content": 0.0031484358478337526, "timestamp": "2025-10-01 04:14:14.875897", "step": 2547, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:14.910486", "step": 2547, "epoch": 3 }, { "type": "loss", "content": 0.002964512910693884, "timestamp": "2025-10-01 04:14:14.935708", "step": 2548, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:14.969640", "step": 2548, "epoch": 3 }, { "type": "loss", "content": 0.013624334707856178, "timestamp": "2025-10-01 04:14:14.973723", "step": 2549, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:15.005054", "step": 2549, "epoch": 3 }, { "type": "loss", "content": 0.010864500887691975, "timestamp": "2025-10-01 04:14:15.007410", "step": 2550, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:15.037818", "step": 2550, "epoch": 3 }, { "type": "loss", "content": 0.015076599083840847, "timestamp": "2025-10-01 04:14:15.040439", "step": 2551, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:15.071172", "step": 2551, "epoch": 3 }, { "type": "loss", "content": 0.0038652902003377676, "timestamp": "2025-10-01 04:14:15.094701", "step": 2552, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:15.125585", "step": 2552, "epoch": 3 }, { "type": "loss", "content": 0.0028856198769062757, "timestamp": "2025-10-01 04:14:15.128081", "step": 2553, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:15.165898", "step": 2553, "epoch": 3 }, { "type": "loss", "content": 0.021579725667834282, "timestamp": "2025-10-01 04:14:15.168973", "step": 2554, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:15.200424", "step": 2554, "epoch": 3 }, { "type": "loss", "content": 0.0009834429947659373, "timestamp": "2025-10-01 04:14:15.202996", "step": 2555, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:15.235241", "step": 2555, "epoch": 3 }, { "type": "loss", "content": 0.0009941781172528863, "timestamp": "2025-10-01 04:14:15.259290", "step": 2556, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:15.291422", "step": 2556, "epoch": 3 }, { "type": "loss", "content": 0.0052528358064591885, "timestamp": "2025-10-01 04:14:15.294602", "step": 2557, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:15.326972", "step": 2557, "epoch": 3 }, { "type": "loss", "content": 0.02679816260933876, "timestamp": "2025-10-01 04:14:15.334507", "step": 2558, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:15.375140", "step": 2558, "epoch": 3 }, { "type": "loss", "content": 0.026580344885587692, "timestamp": "2025-10-01 04:14:15.378071", "step": 2559, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:15.409296", "step": 2559, "epoch": 3 }, { "type": "loss", "content": 0.0031278219539672136, "timestamp": "2025-10-01 04:14:15.433265", "step": 2560, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:15.463964", "step": 2560, "epoch": 3 }, { "type": "loss", "content": 0.010876401327550411, "timestamp": "2025-10-01 04:14:15.466255", "step": 2561, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:15.496994", "step": 2561, "epoch": 3 }, { "type": "loss", "content": 0.002058443846181035, "timestamp": "2025-10-01 04:14:15.499590", "step": 2562, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:15.529767", "step": 2562, "epoch": 3 }, { "type": "loss", "content": 0.002466656733304262, "timestamp": "2025-10-01 04:14:15.532093", "step": 2563, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:15.562601", "step": 2563, "epoch": 3 }, { "type": "loss", "content": 0.013181686401367188, "timestamp": "2025-10-01 04:14:15.586676", "step": 2564, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:15.617457", "step": 2564, "epoch": 3 }, { "type": "loss", "content": 0.005432442296296358, "timestamp": "2025-10-01 04:14:15.619867", "step": 2565, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:14:16.382678", "step": 2565, "epoch": 3 }, { "type": "pplx", "content": 50190753.812678196, "timestamp": "2025-10-01 04:14:16.384933", "step": 2565, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:16.414573", "step": 2565, "epoch": 3 }, { "type": "loss", "content": 0.014795447699725628, "timestamp": "2025-10-01 04:14:16.417114", "step": 2566, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:16.448651", "step": 2566, "epoch": 3 }, { "type": "loss", "content": 0.005171437747776508, "timestamp": "2025-10-01 04:14:16.451078", "step": 2567, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:16.482332", "step": 2567, "epoch": 3 }, { "type": "loss", "content": 0.003639362519606948, "timestamp": "2025-10-01 04:14:16.506129", "step": 2568, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:16.537415", "step": 2568, "epoch": 3 }, { "type": "loss", "content": 0.001160087645985186, "timestamp": "2025-10-01 04:14:16.540098", "step": 2569, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:16.569950", "step": 2569, "epoch": 3 }, { "type": "loss", "content": 0.010151208378374577, "timestamp": "2025-10-01 04:14:16.572165", "step": 2570, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:16.602491", "step": 2570, "epoch": 3 }, { "type": "loss", "content": 0.011499554850161076, "timestamp": "2025-10-01 04:14:16.604979", "step": 2571, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:16.635943", "step": 2571, "epoch": 3 }, { "type": "loss", "content": 0.006028040777891874, "timestamp": "2025-10-01 04:14:16.659814", "step": 2572, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:16.690759", "step": 2572, "epoch": 3 }, { "type": "loss", "content": 0.0026039001531898975, "timestamp": "2025-10-01 04:14:16.693138", "step": 2573, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:16.727217", "step": 2573, "epoch": 3 }, { "type": "loss", "content": 0.0056630512699484825, "timestamp": "2025-10-01 04:14:16.730655", "step": 2574, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:16.761711", "step": 2574, "epoch": 3 }, { "type": "loss", "content": 0.010089273564517498, "timestamp": "2025-10-01 04:14:16.763937", "step": 2575, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:16.795221", "step": 2575, "epoch": 3 }, { "type": "loss", "content": 0.004652588162571192, "timestamp": "2025-10-01 04:14:16.819081", "step": 2576, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:16.850124", "step": 2576, "epoch": 3 }, { "type": "loss", "content": 0.003421203466132283, "timestamp": "2025-10-01 04:14:16.852842", "step": 2577, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:16.887000", "step": 2577, "epoch": 3 }, { "type": "loss", "content": 0.014077236875891685, "timestamp": "2025-10-01 04:14:16.889395", "step": 2578, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:16.920436", "step": 2578, "epoch": 3 }, { "type": "loss", "content": 0.0071408068761229515, "timestamp": "2025-10-01 04:14:16.924259", "step": 2579, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:16.955419", "step": 2579, "epoch": 3 }, { "type": "loss", "content": 0.004757278598845005, "timestamp": "2025-10-01 04:14:16.979514", "step": 2580, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:17.011149", "step": 2580, "epoch": 3 }, { "type": "loss", "content": 0.002742630196735263, "timestamp": "2025-10-01 04:14:17.013407", "step": 2581, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:17.044745", "step": 2581, "epoch": 3 }, { "type": "loss", "content": 0.004369654227048159, "timestamp": "2025-10-01 04:14:17.047351", "step": 2582, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:17.078686", "step": 2582, "epoch": 3 }, { "type": "loss", "content": 0.004402398131787777, "timestamp": "2025-10-01 04:14:17.086949", "step": 2583, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:17.117369", "step": 2583, "epoch": 3 }, { "type": "loss", "content": 0.0065698400139808655, "timestamp": "2025-10-01 04:14:17.141250", "step": 2584, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:17.173057", "step": 2584, "epoch": 3 }, { "type": "loss", "content": 0.008920601569116116, "timestamp": "2025-10-01 04:14:17.175493", "step": 2585, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:17.206361", "step": 2585, "epoch": 3 }, { "type": "loss", "content": 0.04373012110590935, "timestamp": "2025-10-01 04:14:17.208825", "step": 2586, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:17.239396", "step": 2586, "epoch": 3 }, { "type": "loss", "content": 0.005760248750448227, "timestamp": "2025-10-01 04:14:17.241940", "step": 2587, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:17.273584", "step": 2587, "epoch": 3 }, { "type": "loss", "content": 0.0028052874840795994, "timestamp": "2025-10-01 04:14:17.297589", "step": 2588, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:17.328965", "step": 2588, "epoch": 3 }, { "type": "loss", "content": 0.0062300702556967735, "timestamp": "2025-10-01 04:14:17.331347", "step": 2589, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:17.363434", "step": 2589, "epoch": 3 }, { "type": "loss", "content": 0.010736130177974701, "timestamp": "2025-10-01 04:14:17.365760", "step": 2590, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:17.399448", "step": 2590, "epoch": 3 }, { "type": "loss", "content": 0.005298420321196318, "timestamp": "2025-10-01 04:14:17.403017", "step": 2591, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:17.434729", "step": 2591, "epoch": 3 }, { "type": "loss", "content": 0.002143836347386241, "timestamp": "2025-10-01 04:14:17.459706", "step": 2592, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:17.493614", "step": 2592, "epoch": 3 }, { "type": "loss", "content": 0.0014678857987746596, "timestamp": "2025-10-01 04:14:17.495902", "step": 2593, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:17.526838", "step": 2593, "epoch": 3 }, { "type": "loss", "content": 0.0037119637709110975, "timestamp": "2025-10-01 04:14:17.529537", "step": 2594, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:17.561227", "step": 2594, "epoch": 3 }, { "type": "loss", "content": 0.011288485489785671, "timestamp": "2025-10-01 04:14:17.564211", "step": 2595, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 160 ], "flops": 4746299996032 }, "timestamp": "2025-10-01 04:14:17.595353", "step": 2595, "epoch": 3 }, { "type": "loss", "content": 0.02088664285838604, "timestamp": "2025-10-01 04:14:17.619374", "step": 2596, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:17.650492", "step": 2596, "epoch": 3 }, { "type": "loss", "content": 0.0031742292921990156, "timestamp": "2025-10-01 04:14:17.652633", "step": 2597, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:17.682786", "step": 2597, "epoch": 3 }, { "type": "loss", "content": 0.0014582558069378138, "timestamp": "2025-10-01 04:14:17.685425", "step": 2598, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:17.719604", "step": 2598, "epoch": 3 }, { "type": "loss", "content": 0.007793535012751818, "timestamp": "2025-10-01 04:14:17.722033", "step": 2599, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:17.754084", "step": 2599, "epoch": 3 }, { "type": "loss", "content": 0.002564321970567107, "timestamp": "2025-10-01 04:14:17.777854", "step": 2600, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:17.809076", "step": 2600, "epoch": 3 }, { "type": "loss", "content": 0.008807080797851086, "timestamp": "2025-10-01 04:14:17.811676", "step": 2601, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:17.843123", "step": 2601, "epoch": 3 }, { "type": "loss", "content": 0.008557865396142006, "timestamp": "2025-10-01 04:14:17.845710", "step": 2602, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:17.875868", "step": 2602, "epoch": 3 }, { "type": "loss", "content": 0.0007431748090311885, "timestamp": "2025-10-01 04:14:17.878083", "step": 2603, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:17.909375", "step": 2603, "epoch": 3 }, { "type": "loss", "content": 0.023754892870783806, "timestamp": "2025-10-01 04:14:17.933487", "step": 2604, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:17.966059", "step": 2604, "epoch": 3 }, { "type": "loss", "content": 0.017130501568317413, "timestamp": "2025-10-01 04:14:17.968495", "step": 2605, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:17.999133", "step": 2605, "epoch": 3 }, { "type": "loss", "content": 0.0002764493983704597, "timestamp": "2025-10-01 04:14:18.001793", "step": 2606, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:18.032473", "step": 2606, "epoch": 3 }, { "type": "loss", "content": 0.021002618595957756, "timestamp": "2025-10-01 04:14:18.036349", "step": 2607, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:18.068330", "step": 2607, "epoch": 3 }, { "type": "loss", "content": 0.001229580258950591, "timestamp": "2025-10-01 04:14:18.092287", "step": 2608, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:18.121831", "step": 2608, "epoch": 3 }, { "type": "loss", "content": 0.006011900492012501, "timestamp": "2025-10-01 04:14:18.123843", "step": 2609, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:18.156104", "step": 2609, "epoch": 3 }, { "type": "loss", "content": 0.0011188319185748696, "timestamp": "2025-10-01 04:14:18.159077", "step": 2610, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:18.194921", "step": 2610, "epoch": 3 }, { "type": "loss", "content": 0.0035923365503549576, "timestamp": "2025-10-01 04:14:18.197480", "step": 2611, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:18.228575", "step": 2611, "epoch": 3 }, { "type": "loss", "content": 0.0025855558924376965, "timestamp": "2025-10-01 04:14:18.255153", "step": 2612, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:18.292945", "step": 2612, "epoch": 3 }, { "type": "loss", "content": 0.04813668876886368, "timestamp": "2025-10-01 04:14:18.295171", "step": 2613, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:18.326638", "step": 2613, "epoch": 3 }, { "type": "loss", "content": 0.00044949265429750085, "timestamp": "2025-10-01 04:14:18.329294", "step": 2614, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:18.360404", "step": 2614, "epoch": 3 }, { "type": "loss", "content": 0.0025543272495269775, "timestamp": "2025-10-01 04:14:18.362947", "step": 2615, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:18.395636", "step": 2615, "epoch": 3 }, { "type": "loss", "content": 0.0008481559343636036, "timestamp": "2025-10-01 04:14:18.419708", "step": 2616, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:18.451499", "step": 2616, "epoch": 3 }, { "type": "loss", "content": 0.008253288455307484, "timestamp": "2025-10-01 04:14:18.455312", "step": 2617, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:18.489097", "step": 2617, "epoch": 3 }, { "type": "loss", "content": 0.0006239612703211606, "timestamp": "2025-10-01 04:14:18.492523", "step": 2618, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:18.524474", "step": 2618, "epoch": 3 }, { "type": "loss", "content": 0.00023207686899695545, "timestamp": "2025-10-01 04:14:18.527162", "step": 2619, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:18.558301", "step": 2619, "epoch": 3 }, { "type": "loss", "content": 4.780263043357991e-05, "timestamp": "2025-10-01 04:14:18.582702", "step": 2620, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:18.615807", "step": 2620, "epoch": 3 }, { "type": "loss", "content": 2.830891935445834e-05, "timestamp": "2025-10-01 04:14:18.617963", "step": 2621, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:18.648595", "step": 2621, "epoch": 3 }, { "type": "loss", "content": 0.00848412699997425, "timestamp": "2025-10-01 04:14:18.651200", "step": 2622, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:14:19.403155", "step": 2622, "epoch": 3 }, { "type": "pplx", "content": 63792326.1143382, "timestamp": "2025-10-01 04:14:19.405245", "step": 2622, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:19.434615", "step": 2622, "epoch": 3 }, { "type": "loss", "content": 0.0022637471556663513, "timestamp": "2025-10-01 04:14:19.438355", "step": 2623, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:19.469816", "step": 2623, "epoch": 3 }, { "type": "loss", "content": 3.309018211439252e-05, "timestamp": "2025-10-01 04:14:19.493957", "step": 2624, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:19.524313", "step": 2624, "epoch": 3 }, { "type": "loss", "content": 0.002056579804047942, "timestamp": "2025-10-01 04:14:19.528433", "step": 2625, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:19.561793", "step": 2625, "epoch": 3 }, { "type": "loss", "content": 0.01622423343360424, "timestamp": "2025-10-01 04:14:19.564654", "step": 2626, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:19.595641", "step": 2626, "epoch": 3 }, { "type": "loss", "content": 0.00038660448626615107, "timestamp": "2025-10-01 04:14:19.599181", "step": 2627, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:19.629235", "step": 2627, "epoch": 3 }, { "type": "loss", "content": 0.008610601536929607, "timestamp": "2025-10-01 04:14:19.653427", "step": 2628, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:19.687698", "step": 2628, "epoch": 3 }, { "type": "loss", "content": 0.0007182090193964541, "timestamp": "2025-10-01 04:14:19.690621", "step": 2629, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:19.724730", "step": 2629, "epoch": 3 }, { "type": "loss", "content": 0.00033229749533347785, "timestamp": "2025-10-01 04:14:19.728034", "step": 2630, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:19.758266", "step": 2630, "epoch": 3 }, { "type": "loss", "content": 0.000259073858615011, "timestamp": "2025-10-01 04:14:19.760804", "step": 2631, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:19.794734", "step": 2631, "epoch": 3 }, { "type": "loss", "content": 0.004574598278850317, "timestamp": "2025-10-01 04:14:19.819707", "step": 2632, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:19.850263", "step": 2632, "epoch": 3 }, { "type": "loss", "content": 0.00040219916263595223, "timestamp": "2025-10-01 04:14:19.853064", "step": 2633, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:19.884737", "step": 2633, "epoch": 3 }, { "type": "loss", "content": 0.0024398225359618664, "timestamp": "2025-10-01 04:14:19.887181", "step": 2634, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:19.917698", "step": 2634, "epoch": 3 }, { "type": "loss", "content": 0.00624752277508378, "timestamp": "2025-10-01 04:14:19.920816", "step": 2635, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:19.952694", "step": 2635, "epoch": 3 }, { "type": "loss", "content": 0.028992334380745888, "timestamp": "2025-10-01 04:14:19.976994", "step": 2636, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:20.008692", "step": 2636, "epoch": 3 }, { "type": "loss", "content": 0.004315625410526991, "timestamp": "2025-10-01 04:14:20.011478", "step": 2637, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:20.043629", "step": 2637, "epoch": 3 }, { "type": "loss", "content": 0.0007517217891290784, "timestamp": "2025-10-01 04:14:20.046281", "step": 2638, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:20.076944", "step": 2638, "epoch": 3 }, { "type": "loss", "content": 0.004551793914288282, "timestamp": "2025-10-01 04:14:20.079535", "step": 2639, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:20.110600", "step": 2639, "epoch": 3 }, { "type": "loss", "content": 0.002312313299626112, "timestamp": "2025-10-01 04:14:20.134985", "step": 2640, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:20.166039", "step": 2640, "epoch": 3 }, { "type": "loss", "content": 0.0001878144685178995, "timestamp": "2025-10-01 04:14:20.168931", "step": 2641, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:20.199235", "step": 2641, "epoch": 3 }, { "type": "loss", "content": 0.0020596140529960394, "timestamp": "2025-10-01 04:14:20.202099", "step": 2642, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:20.233575", "step": 2642, "epoch": 3 }, { "type": "loss", "content": 0.007440303452312946, "timestamp": "2025-10-01 04:14:20.237761", "step": 2643, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:20.269287", "step": 2643, "epoch": 3 }, { "type": "loss", "content": 0.00196535955183208, "timestamp": "2025-10-01 04:14:20.295526", "step": 2644, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:20.326685", "step": 2644, "epoch": 3 }, { "type": "loss", "content": 0.0003387908509466797, "timestamp": "2025-10-01 04:14:20.329188", "step": 2645, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:20.361934", "step": 2645, "epoch": 3 }, { "type": "loss", "content": 0.0007078064372763038, "timestamp": "2025-10-01 04:14:20.365156", "step": 2646, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:20.397017", "step": 2646, "epoch": 3 }, { "type": "loss", "content": 0.018304944038391113, "timestamp": "2025-10-01 04:14:20.399560", "step": 2647, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:20.431741", "step": 2647, "epoch": 3 }, { "type": "loss", "content": 0.0036956293042749166, "timestamp": "2025-10-01 04:14:20.455468", "step": 2648, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:20.487426", "step": 2648, "epoch": 3 }, { "type": "loss", "content": 0.0026661923620849848, "timestamp": "2025-10-01 04:14:20.490062", "step": 2649, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:20.520600", "step": 2649, "epoch": 3 }, { "type": "loss", "content": 0.0004781464231200516, "timestamp": "2025-10-01 04:14:20.523290", "step": 2650, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:20.557150", "step": 2650, "epoch": 3 }, { "type": "loss", "content": 0.0003797627578023821, "timestamp": "2025-10-01 04:14:20.560244", "step": 2651, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:20.594731", "step": 2651, "epoch": 3 }, { "type": "loss", "content": 0.000620409962721169, "timestamp": "2025-10-01 04:14:20.618941", "step": 2652, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:20.650015", "step": 2652, "epoch": 3 }, { "type": "loss", "content": 0.0004895195597782731, "timestamp": "2025-10-01 04:14:20.652324", "step": 2653, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:20.683946", "step": 2653, "epoch": 3 }, { "type": "loss", "content": 0.011492596007883549, "timestamp": "2025-10-01 04:14:20.688089", "step": 2654, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:20.721129", "step": 2654, "epoch": 3 }, { "type": "loss", "content": 0.03194532170891762, "timestamp": "2025-10-01 04:14:20.723852", "step": 2655, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:20.754910", "step": 2655, "epoch": 3 }, { "type": "loss", "content": 0.0101211192086339, "timestamp": "2025-10-01 04:14:20.779419", "step": 2656, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:20.810669", "step": 2656, "epoch": 3 }, { "type": "loss", "content": 0.00036434808862395585, "timestamp": "2025-10-01 04:14:20.812945", "step": 2657, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:20.844970", "step": 2657, "epoch": 3 }, { "type": "loss", "content": 0.0037049304228276014, "timestamp": "2025-10-01 04:14:20.847690", "step": 2658, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:20.877959", "step": 2658, "epoch": 3 }, { "type": "loss", "content": 0.00512391421943903, "timestamp": "2025-10-01 04:14:20.880870", "step": 2659, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:20.911538", "step": 2659, "epoch": 3 }, { "type": "loss", "content": 0.00954340398311615, "timestamp": "2025-10-01 04:14:20.935517", "step": 2660, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:20.966641", "step": 2660, "epoch": 3 }, { "type": "loss", "content": 0.0002692260022740811, "timestamp": "2025-10-01 04:14:20.969091", "step": 2661, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:20.999909", "step": 2661, "epoch": 3 }, { "type": "loss", "content": 0.0017530341865494847, "timestamp": "2025-10-01 04:14:21.002738", "step": 2662, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:21.034472", "step": 2662, "epoch": 3 }, { "type": "loss", "content": 0.00228483765386045, "timestamp": "2025-10-01 04:14:21.037543", "step": 2663, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:21.067825", "step": 2663, "epoch": 3 }, { "type": "loss", "content": 0.02074584737420082, "timestamp": "2025-10-01 04:14:21.091774", "step": 2664, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:21.122398", "step": 2664, "epoch": 3 }, { "type": "loss", "content": 0.0003323450800962746, "timestamp": "2025-10-01 04:14:21.125973", "step": 2665, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:21.157028", "step": 2665, "epoch": 3 }, { "type": "loss", "content": 0.00015716443886049092, "timestamp": "2025-10-01 04:14:21.159907", "step": 2666, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:21.192533", "step": 2666, "epoch": 3 }, { "type": "loss", "content": 0.0005555102252401412, "timestamp": "2025-10-01 04:14:21.194946", "step": 2667, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:21.226412", "step": 2667, "epoch": 3 }, { "type": "loss", "content": 0.0007580574601888657, "timestamp": "2025-10-01 04:14:21.250773", "step": 2668, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:21.281355", "step": 2668, "epoch": 3 }, { "type": "loss", "content": 4.993396214558743e-05, "timestamp": "2025-10-01 04:14:21.284168", "step": 2669, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:21.314955", "step": 2669, "epoch": 3 }, { "type": "loss", "content": 0.008731653913855553, "timestamp": "2025-10-01 04:14:21.318030", "step": 2670, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:21.349832", "step": 2670, "epoch": 3 }, { "type": "loss", "content": 0.0017039062222465873, "timestamp": "2025-10-01 04:14:21.353765", "step": 2671, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:21.384203", "step": 2671, "epoch": 3 }, { "type": "loss", "content": 0.0005672592669725418, "timestamp": "2025-10-01 04:14:21.408231", "step": 2672, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:21.439187", "step": 2672, "epoch": 3 }, { "type": "loss", "content": 0.004365603905171156, "timestamp": "2025-10-01 04:14:21.441872", "step": 2673, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:21.478691", "step": 2673, "epoch": 3 }, { "type": "loss", "content": 0.00015389944019261748, "timestamp": "2025-10-01 04:14:21.481530", "step": 2674, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:21.513860", "step": 2674, "epoch": 3 }, { "type": "loss", "content": 0.0008061733096837997, "timestamp": "2025-10-01 04:14:21.516433", "step": 2675, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:21.547673", "step": 2675, "epoch": 3 }, { "type": "loss", "content": 0.0013293975498527288, "timestamp": "2025-10-01 04:14:21.571717", "step": 2676, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:21.602273", "step": 2676, "epoch": 3 }, { "type": "loss", "content": 0.001839006319642067, "timestamp": "2025-10-01 04:14:21.604901", "step": 2677, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:21.636975", "step": 2677, "epoch": 3 }, { "type": "loss", "content": 0.002645869040861726, "timestamp": "2025-10-01 04:14:21.639764", "step": 2678, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:21.670532", "step": 2678, "epoch": 3 }, { "type": "loss", "content": 0.00023593958758283406, "timestamp": "2025-10-01 04:14:21.673214", "step": 2679, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:14:22.439115", "step": 2679, "epoch": 3 }, { "type": "pplx", "content": 64678083.10080412, "timestamp": "2025-10-01 04:14:22.445683", "step": 2679, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:22.475693", "step": 2679, "epoch": 3 }, { "type": "loss", "content": 2.630149720062036e-05, "timestamp": "2025-10-01 04:14:22.499577", "step": 2680, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:22.531997", "step": 2680, "epoch": 3 }, { "type": "loss", "content": 0.00016436410078313202, "timestamp": "2025-10-01 04:14:22.534194", "step": 2681, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:22.570502", "step": 2681, "epoch": 3 }, { "type": "loss", "content": 0.00018075850675813854, "timestamp": "2025-10-01 04:14:22.573125", "step": 2682, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:22.605089", "step": 2682, "epoch": 3 }, { "type": "loss", "content": 0.00015388346218969673, "timestamp": "2025-10-01 04:14:22.611991", "step": 2683, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:22.654972", "step": 2683, "epoch": 3 }, { "type": "loss", "content": 5.065502045908943e-05, "timestamp": "2025-10-01 04:14:22.679014", "step": 2684, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:22.709670", "step": 2684, "epoch": 3 }, { "type": "loss", "content": 9.08417787286453e-05, "timestamp": "2025-10-01 04:14:22.712393", "step": 2685, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:22.743437", "step": 2685, "epoch": 3 }, { "type": "loss", "content": 0.0006787800812162459, "timestamp": "2025-10-01 04:14:22.747002", "step": 2686, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:22.778437", "step": 2686, "epoch": 3 }, { "type": "loss", "content": 0.0005651679239235818, "timestamp": "2025-10-01 04:14:22.780923", "step": 2687, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:22.814151", "step": 2687, "epoch": 3 }, { "type": "loss", "content": 0.00019555458857212216, "timestamp": "2025-10-01 04:14:22.838244", "step": 2688, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:22.871133", "step": 2688, "epoch": 3 }, { "type": "loss", "content": 1.657810207689181e-05, "timestamp": "2025-10-01 04:14:22.873991", "step": 2689, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:22.905790", "step": 2689, "epoch": 3 }, { "type": "loss", "content": 0.0001516373740741983, "timestamp": "2025-10-01 04:14:22.908134", "step": 2690, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:22.945389", "step": 2690, "epoch": 3 }, { "type": "loss", "content": 0.00043833054951392114, "timestamp": "2025-10-01 04:14:22.948425", "step": 2691, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:22.980571", "step": 2691, "epoch": 3 }, { "type": "loss", "content": 0.00034414706169627607, "timestamp": "2025-10-01 04:14:23.005092", "step": 2692, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:23.036093", "step": 2692, "epoch": 3 }, { "type": "loss", "content": 0.0003396179818082601, "timestamp": "2025-10-01 04:14:23.038656", "step": 2693, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:23.069169", "step": 2693, "epoch": 3 }, { "type": "loss", "content": 0.00028526881942525506, "timestamp": "2025-10-01 04:14:23.071806", "step": 2694, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:23.102330", "step": 2694, "epoch": 3 }, { "type": "loss", "content": 0.00022634358901996166, "timestamp": "2025-10-01 04:14:23.104959", "step": 2695, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:23.135372", "step": 2695, "epoch": 3 }, { "type": "loss", "content": 0.0003560289624147117, "timestamp": "2025-10-01 04:14:23.159269", "step": 2696, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:23.189584", "step": 2696, "epoch": 3 }, { "type": "loss", "content": 2.8566690161824226e-05, "timestamp": "2025-10-01 04:14:23.192495", "step": 2697, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:23.223438", "step": 2697, "epoch": 3 }, { "type": "loss", "content": 0.00014089918113313615, "timestamp": "2025-10-01 04:14:23.225946", "step": 2698, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:23.256127", "step": 2698, "epoch": 3 }, { "type": "loss", "content": 0.00024118837609421462, "timestamp": "2025-10-01 04:14:23.258490", "step": 2699, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:23.289144", "step": 2699, "epoch": 3 }, { "type": "loss", "content": 0.0001321942690992728, "timestamp": "2025-10-01 04:14:23.313059", "step": 2700, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:23.343977", "step": 2700, "epoch": 3 }, { "type": "loss", "content": 0.0001804428466130048, "timestamp": "2025-10-01 04:14:23.346306", "step": 2701, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:23.377256", "step": 2701, "epoch": 3 }, { "type": "loss", "content": 5.039268216933124e-05, "timestamp": "2025-10-01 04:14:23.380200", "step": 2702, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:23.410924", "step": 2702, "epoch": 3 }, { "type": "loss", "content": 0.000560373708140105, "timestamp": "2025-10-01 04:14:23.414103", "step": 2703, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:23.448321", "step": 2703, "epoch": 3 }, { "type": "loss", "content": 0.02016831561923027, "timestamp": "2025-10-01 04:14:23.473051", "step": 2704, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:23.505733", "step": 2704, "epoch": 3 }, { "type": "loss", "content": 7.694788655498996e-05, "timestamp": "2025-10-01 04:14:23.509563", "step": 2705, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:23.543125", "step": 2705, "epoch": 3 }, { "type": "loss", "content": 3.4393237001495436e-05, "timestamp": "2025-10-01 04:14:23.546537", "step": 2706, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:23.580126", "step": 2706, "epoch": 3 }, { "type": "loss", "content": 0.00016384324408136308, "timestamp": "2025-10-01 04:14:23.583589", "step": 2707, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:23.625307", "step": 2707, "epoch": 3 }, { "type": "loss", "content": 0.0013702316209673882, "timestamp": "2025-10-01 04:14:23.649747", "step": 2708, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:23.683137", "step": 2708, "epoch": 3 }, { "type": "loss", "content": 0.0005768322153016925, "timestamp": "2025-10-01 04:14:23.686257", "step": 2709, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:23.718163", "step": 2709, "epoch": 3 }, { "type": "loss", "content": 0.002737879054620862, "timestamp": "2025-10-01 04:14:23.720987", "step": 2710, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:23.755528", "step": 2710, "epoch": 3 }, { "type": "loss", "content": 5.805503678857349e-05, "timestamp": "2025-10-01 04:14:23.759385", "step": 2711, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:23.800299", "step": 2711, "epoch": 3 }, { "type": "loss", "content": 1.628149584576022e-05, "timestamp": "2025-10-01 04:14:23.825425", "step": 2712, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:23.859178", "step": 2712, "epoch": 3 }, { "type": "loss", "content": 0.0020005072001367807, "timestamp": "2025-10-01 04:14:23.862185", "step": 2713, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:23.894207", "step": 2713, "epoch": 3 }, { "type": "loss", "content": 0.09073884040117264, "timestamp": "2025-10-01 04:14:23.897399", "step": 2714, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:23.929788", "step": 2714, "epoch": 3 }, { "type": "loss", "content": 0.04919225350022316, "timestamp": "2025-10-01 04:14:23.933008", "step": 2715, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:23.965571", "step": 2715, "epoch": 3 }, { "type": "loss", "content": 0.01102256216108799, "timestamp": "2025-10-01 04:14:23.990031", "step": 2716, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:24.023163", "step": 2716, "epoch": 3 }, { "type": "loss", "content": 0.014040348120033741, "timestamp": "2025-10-01 04:14:24.026486", "step": 2717, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:24.062599", "step": 2717, "epoch": 3 }, { "type": "loss", "content": 2.2828298824606463e-05, "timestamp": "2025-10-01 04:14:24.066467", "step": 2718, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:24.098871", "step": 2718, "epoch": 3 }, { "type": "loss", "content": 0.0004682219005189836, "timestamp": "2025-10-01 04:14:24.101749", "step": 2719, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:24.134718", "step": 2719, "epoch": 3 }, { "type": "loss", "content": 0.021064382046461105, "timestamp": "2025-10-01 04:14:24.160136", "step": 2720, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:24.192876", "step": 2720, "epoch": 3 }, { "type": "loss", "content": 0.0023182774893939495, "timestamp": "2025-10-01 04:14:24.195912", "step": 2721, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:24.228385", "step": 2721, "epoch": 3 }, { "type": "loss", "content": 9.966571815311909e-05, "timestamp": "2025-10-01 04:14:24.231945", "step": 2722, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:24.265017", "step": 2722, "epoch": 3 }, { "type": "loss", "content": 0.036073338240385056, "timestamp": "2025-10-01 04:14:24.268566", "step": 2723, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:24.300716", "step": 2723, "epoch": 3 }, { "type": "loss", "content": 0.0005764077068306506, "timestamp": "2025-10-01 04:14:24.324838", "step": 2724, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:24.358191", "step": 2724, "epoch": 3 }, { "type": "loss", "content": 0.004021498374640942, "timestamp": "2025-10-01 04:14:24.361235", "step": 2725, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:24.395162", "step": 2725, "epoch": 3 }, { "type": "loss", "content": 0.0007272333605214953, "timestamp": "2025-10-01 04:14:24.398432", "step": 2726, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:24.430728", "step": 2726, "epoch": 3 }, { "type": "loss", "content": 0.0004156237991992384, "timestamp": "2025-10-01 04:14:24.433363", "step": 2727, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:24.465723", "step": 2727, "epoch": 3 }, { "type": "loss", "content": 0.004409321118146181, "timestamp": "2025-10-01 04:14:24.491592", "step": 2728, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:24.525686", "step": 2728, "epoch": 3 }, { "type": "loss", "content": 0.00032090378226712346, "timestamp": "2025-10-01 04:14:24.528148", "step": 2729, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:24.559962", "step": 2729, "epoch": 3 }, { "type": "loss", "content": 0.005059172865003347, "timestamp": "2025-10-01 04:14:24.562831", "step": 2730, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:24.599559", "step": 2730, "epoch": 3 }, { "type": "loss", "content": 0.0022533093579113483, "timestamp": "2025-10-01 04:14:24.602996", "step": 2731, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:24.637850", "step": 2731, "epoch": 3 }, { "type": "loss", "content": 0.0028393480461090803, "timestamp": "2025-10-01 04:14:24.662388", "step": 2732, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:24.696061", "step": 2732, "epoch": 3 }, { "type": "loss", "content": 0.0021085678599774837, "timestamp": "2025-10-01 04:14:24.699093", "step": 2733, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:24.732334", "step": 2733, "epoch": 3 }, { "type": "loss", "content": 0.004252140875905752, "timestamp": "2025-10-01 04:14:24.735370", "step": 2734, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:24.768097", "step": 2734, "epoch": 3 }, { "type": "loss", "content": 0.013400801457464695, "timestamp": "2025-10-01 04:14:24.771066", "step": 2735, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:24.805823", "step": 2735, "epoch": 3 }, { "type": "loss", "content": 0.018713532015681267, "timestamp": "2025-10-01 04:14:24.830526", "step": 2736, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:14:25.644065", "step": 2736, "epoch": 3 }, { "type": "pplx", "content": 68029215.63403933, "timestamp": "2025-10-01 04:14:25.645970", "step": 2736, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:25.674612", "step": 2736, "epoch": 3 }, { "type": "loss", "content": 0.0023472080938518047, "timestamp": "2025-10-01 04:14:25.676892", "step": 2737, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:25.708452", "step": 2737, "epoch": 3 }, { "type": "loss", "content": 0.009190792217850685, "timestamp": "2025-10-01 04:14:25.711120", "step": 2738, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:25.741240", "step": 2738, "epoch": 3 }, { "type": "loss", "content": 0.015830060467123985, "timestamp": "2025-10-01 04:14:25.743599", "step": 2739, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:25.773643", "step": 2739, "epoch": 3 }, { "type": "loss", "content": 0.008671787567436695, "timestamp": "2025-10-01 04:14:25.797839", "step": 2740, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:25.828459", "step": 2740, "epoch": 3 }, { "type": "loss", "content": 0.008564976043999195, "timestamp": "2025-10-01 04:14:25.830663", "step": 2741, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:25.861727", "step": 2741, "epoch": 3 }, { "type": "loss", "content": 0.002389447530731559, "timestamp": "2025-10-01 04:14:25.863992", "step": 2742, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:25.893764", "step": 2742, "epoch": 3 }, { "type": "loss", "content": 0.01760433055460453, "timestamp": "2025-10-01 04:14:25.896777", "step": 2743, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:25.927483", "step": 2743, "epoch": 3 }, { "type": "loss", "content": 0.019632836803793907, "timestamp": "2025-10-01 04:14:25.951270", "step": 2744, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:25.981671", "step": 2744, "epoch": 3 }, { "type": "loss", "content": 0.000977389863692224, "timestamp": "2025-10-01 04:14:25.983860", "step": 2745, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:26.014891", "step": 2745, "epoch": 3 }, { "type": "loss", "content": 0.0004742510209325701, "timestamp": "2025-10-01 04:14:26.017322", "step": 2746, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:26.049127", "step": 2746, "epoch": 3 }, { "type": "loss", "content": 0.0005396570777520537, "timestamp": "2025-10-01 04:14:26.052329", "step": 2747, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 112 ], "flops": 3322488817984 }, "timestamp": "2025-10-01 04:14:26.083573", "step": 2747, "epoch": 3 }, { "type": "loss", "content": 0.001477371552027762, "timestamp": "2025-10-01 04:14:26.107906", "step": 2748, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:26.139308", "step": 2748, "epoch": 3 }, { "type": "loss", "content": 0.0005041286931373179, "timestamp": "2025-10-01 04:14:26.141655", "step": 2749, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 128 ], "flops": 3797092544000 }, "timestamp": "2025-10-01 04:14:26.172138", "step": 2749, "epoch": 3 }, { "type": "loss", "content": 0.0012628821423277259, "timestamp": "2025-10-01 04:14:26.174280", "step": 2750, "epoch": 3 }, { "type": "flops", "content": { "type": "train", "batch_dim": [ 4, 144 ], "flops": 4271696270016 }, "timestamp": "2025-10-01 04:14:26.204653", "step": 2750, "epoch": 3 }, { "type": "loss", "content": 0.0002868495066650212, "timestamp": "2025-10-01 04:14:26.207392", "step": 2751, "epoch": 3 }, { "type": "flops", "content": [ { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 112 ], "batch_size": 8, "flops": 2214805229440 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 80 ], "batch_size": 8, "flops": 1582003754624 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 }, { "type": "perplexity", "in_batch_dim": [ 8, 96 ], "batch_size": 8, "flops": 1898404492032 } ], "timestamp": "2025-10-01 04:14:26.953400", "step": 2751, "epoch": 3 }, { "type": "pplx", "content": 38842487.79769442, "timestamp": "2025-10-01 04:14:26.955402", "step": 2751, "epoch": 3 }, { "type": "best_pplx", "content": 38842487.79769442, "timestamp": "2025-10-01 04:14:26.956761", "step": 2751, "epoch": 3 }, { "type": "best_step", "content": 2751, "timestamp": "2025-10-01 04:14:26.958432", "step": 2751, "epoch": 3 }, { "type": "total_pplx_flops", "content": 5014951860256000, "timestamp": "2025-10-01 04:14:26.959981", "step": 2751, "epoch": 3 }, { "type": "total_train_flops", "content": 10640863719936576, "timestamp": "2025-10-01 04:14:26.961873", "step": 2751, "epoch": 3 } ], "best_evals": { "pplx": { "score": 38842487.79769442, "step": 2751 }, "rougel": { "precision": 0.8382352941176471, "recall": 0.8382352941176471, "fmeasure": 0.8382352941176471 } } }